diff options
Diffstat (limited to 'src/pkg/bufio')
-rw-r--r-- | src/pkg/bufio/bufio.go | 100 | ||||
-rw-r--r-- | src/pkg/bufio/bufio_test.go | 310 | ||||
-rw-r--r-- | src/pkg/bufio/example_test.go | 74 | ||||
-rw-r--r-- | src/pkg/bufio/export_test.go | 27 | ||||
-rw-r--r-- | src/pkg/bufio/scan.go | 338 | ||||
-rw-r--r-- | src/pkg/bufio/scan_test.go | 370 |
6 files changed, 1193 insertions, 26 deletions
diff --git a/src/pkg/bufio/bufio.go b/src/pkg/bufio/bufio.go index b44d0e7d1..ee69c2d31 100644 --- a/src/pkg/bufio/bufio.go +++ b/src/pkg/bufio/bufio.go @@ -64,6 +64,8 @@ func NewReader(rd io.Reader) *Reader { return NewReaderSize(rd, defaultBufSize) } +var errNegativeRead = errors.New("bufio: reader returned negative count from Read") + // fill reads a new chunk into the buffer. func (b *Reader) fill() { // Slide existing data to beginning. @@ -74,10 +76,13 @@ func (b *Reader) fill() { } // Read new data. - n, e := b.rd.Read(b.buf[b.w:]) + n, err := b.rd.Read(b.buf[b.w:]) + if n < 0 { + panic(errNegativeRead) + } b.w += n - if e != nil { - b.err = e + if err != nil { + b.err = err } } @@ -272,6 +277,9 @@ func (b *Reader) ReadSlice(delim byte) (line []byte, err error) { panic("not reached") } +// ReadLine is a low-level line-reading primitive. Most callers should use +// ReadBytes('\n') or ReadString('\n') instead or use a Scanner. +// // ReadLine tries to return a single line, not including the end-of-line bytes. // If the line was too long for the buffer then isPrefix is set and the // beginning of the line is returned. The rest of the line will be returned @@ -279,6 +287,9 @@ func (b *Reader) ReadSlice(delim byte) (line []byte, err error) { // of the line. The returned buffer is only valid until the next call to // ReadLine. ReadLine either returns a non-nil line or it returns an error, // never both. +// +// The text returned from ReadLine does not include the line end ("\r\n" or "\n"). +// No indication or error is given if the input ends without a final line end. func (b *Reader) ReadLine() (line []byte, isPrefix bool, err error) { line, err = b.ReadSlice('\n') if err == ErrBufferFull { @@ -320,6 +331,7 @@ func (b *Reader) ReadLine() (line []byte, isPrefix bool, err error) { // it returns the data read before the error and the error itself (often io.EOF). // ReadBytes returns err != nil if and only if the returned data does not end in // delim. 
+// For simple uses, a Scanner may be more convenient. func (b *Reader) ReadBytes(delim byte) (line []byte, err error) { // Use ReadSlice to look for array, // accumulating full buffers. @@ -367,9 +379,45 @@ func (b *Reader) ReadBytes(delim byte) (line []byte, err error) { // it returns the data read before the error and the error itself (often io.EOF). // ReadString returns err != nil if and only if the returned data does not end in // delim. +// For simple uses, a Scanner may be more convenient. func (b *Reader) ReadString(delim byte) (line string, err error) { - bytes, e := b.ReadBytes(delim) - return string(bytes), e + bytes, err := b.ReadBytes(delim) + return string(bytes), err +} + +// WriteTo implements io.WriterTo. +func (b *Reader) WriteTo(w io.Writer) (n int64, err error) { + n, err = b.writeBuf(w) + if err != nil { + return + } + + if r, ok := b.rd.(io.WriterTo); ok { + m, err := r.WriteTo(w) + n += m + return n, err + } + + for b.fill(); b.r < b.w; b.fill() { + m, err := b.writeBuf(w) + n += m + if err != nil { + return n, err + } + } + + if b.err == io.EOF { + b.err = nil + } + + return n, b.readErr() +} + +// writeBuf writes the Reader's buffer to the writer. +func (b *Reader) writeBuf(w io.Writer) (int64, error) { + n, err := w.Write(b.buf[b.r:b.w]) + b.r += n + return int64(n), err } // buffered output @@ -415,17 +463,17 @@ func (b *Writer) Flush() error { if b.n == 0 { return nil } - n, e := b.wr.Write(b.buf[0:b.n]) - if n < b.n && e == nil { - e = io.ErrShortWrite + n, err := b.wr.Write(b.buf[0:b.n]) + if n < b.n && err == nil { + err = io.ErrShortWrite } - if e != nil { + if err != nil { if n > 0 && n < b.n { copy(b.buf[0:b.n-n], b.buf[n:b.n]) } b.n -= n - b.err = e - return e + b.err = err + return err } b.n = 0 return nil @@ -529,6 +577,36 @@ func (b *Writer) WriteString(s string) (int, error) { return nn, nil } +// ReadFrom implements io.ReaderFrom. 
+func (b *Writer) ReadFrom(r io.Reader) (n int64, err error) { + if b.Buffered() == 0 { + if w, ok := b.wr.(io.ReaderFrom); ok { + return w.ReadFrom(r) + } + } + var m int + for { + m, err = r.Read(b.buf[b.n:]) + if m == 0 { + break + } + b.n += m + n += int64(m) + if b.Available() == 0 { + if err1 := b.Flush(); err1 != nil { + return n, err1 + } + } + if err != nil { + break + } + } + if err == io.EOF { + err = nil + } + return n, err +} + // buffered input and output // ReadWriter stores pointers to a Reader and a Writer. diff --git a/src/pkg/bufio/bufio_test.go b/src/pkg/bufio/bufio_test.go index a43cbd23a..b0e811443 100644 --- a/src/pkg/bufio/bufio_test.go +++ b/src/pkg/bufio/bufio_test.go @@ -28,9 +28,9 @@ func newRot13Reader(r io.Reader) *rot13Reader { } func (r13 *rot13Reader) Read(p []byte) (int, error) { - n, e := r13.r.Read(p) - if e != nil { - return n, e + n, err := r13.r.Read(p) + if err != nil { + return n, err } for i := 0; i < n; i++ { c := p[i] | 0x20 // lowercase byte @@ -48,15 +48,15 @@ func readBytes(buf *Reader) string { var b [1000]byte nb := 0 for { - c, e := buf.ReadByte() - if e == io.EOF { + c, err := buf.ReadByte() + if err == io.EOF { break } - if e == nil { + if err == nil { b[nb] = c nb++ - } else if e != iotest.ErrTimeout { - panic("Data: " + e.Error()) + } else if err != iotest.ErrTimeout { + panic("Data: " + err.Error()) } } return string(b[0:nb]) @@ -93,12 +93,12 @@ var readMakers = []readMaker{ func readLines(b *Reader) string { s := "" for { - s1, e := b.ReadString('\n') - if e == io.EOF { + s1, err := b.ReadString('\n') + if err == io.EOF { break } - if e != nil && e != iotest.ErrTimeout { - panic("GetLines: " + e.Error()) + if err != nil && err != iotest.ErrTimeout { + panic("GetLines: " + err.Error()) } s += s1 } @@ -110,9 +110,9 @@ func reads(buf *Reader, m int) string { var b [1000]byte nb := 0 for { - n, e := buf.Read(b[nb : nb+m]) + n, err := buf.Read(b[nb : nb+m]) nb += n - if e == io.EOF { + if err == io.EOF { break } } 
@@ -748,7 +748,7 @@ func testReadLineNewlines(t *testing.T, input string, expect []readLineResult) { b := NewReaderSize(strings.NewReader(input), minReadBufferSize) for i, e := range expect { line, isPrefix, err := b.ReadLine() - if bytes.Compare(line, e.line) != 0 { + if !bytes.Equal(line, e.line) { t.Errorf("%q call %d, line == %q, want %q", input, i, line, e.line) return } @@ -762,3 +762,283 @@ func testReadLineNewlines(t *testing.T, input string, expect []readLineResult) { } } } + +func createTestInput(n int) []byte { + input := make([]byte, n) + for i := range input { + // 101 and 251 are arbitrary prime numbers. + // The idea is to create an input sequence + // which doesn't repeat too frequently. + input[i] = byte(i % 251) + if i%101 == 0 { + input[i] ^= byte(i / 101) + } + } + return input +} + +func TestReaderWriteTo(t *testing.T) { + input := createTestInput(8192) + r := NewReader(onlyReader{bytes.NewBuffer(input)}) + w := new(bytes.Buffer) + if n, err := r.WriteTo(w); err != nil || n != int64(len(input)) { + t.Fatalf("r.WriteTo(w) = %d, %v, want %d, nil", n, err, len(input)) + } + + for i, val := range w.Bytes() { + if val != input[i] { + t.Errorf("after write: out[%d] = %#x, want %#x", i, val, input[i]) + } + } +} + +type errorWriterToTest struct { + rn, wn int + rerr, werr error + expected error +} + +func (r errorWriterToTest) Read(p []byte) (int, error) { + return len(p) * r.rn, r.rerr +} + +func (w errorWriterToTest) Write(p []byte) (int, error) { + return len(p) * w.wn, w.werr +} + +var errorWriterToTests = []errorWriterToTest{ + {1, 0, nil, io.ErrClosedPipe, io.ErrClosedPipe}, + {0, 1, io.ErrClosedPipe, nil, io.ErrClosedPipe}, + {0, 0, io.ErrUnexpectedEOF, io.ErrClosedPipe, io.ErrClosedPipe}, + {0, 1, io.EOF, nil, nil}, +} + +func TestReaderWriteToErrors(t *testing.T) { + for i, rw := range errorWriterToTests { + r := NewReader(rw) + if _, err := r.WriteTo(rw); err != rw.expected { + t.Errorf("r.WriteTo(errorWriterToTests[%d]) = _, %v, want _,%v", 
i, err, rw.expected) + } + } +} + +func TestWriterReadFrom(t *testing.T) { + ws := []func(io.Writer) io.Writer{ + func(w io.Writer) io.Writer { return onlyWriter{w} }, + func(w io.Writer) io.Writer { return w }, + } + + rs := []func(io.Reader) io.Reader{ + iotest.DataErrReader, + func(r io.Reader) io.Reader { return r }, + } + + for ri, rfunc := range rs { + for wi, wfunc := range ws { + input := createTestInput(8192) + b := new(bytes.Buffer) + w := NewWriter(wfunc(b)) + r := rfunc(bytes.NewBuffer(input)) + if n, err := w.ReadFrom(r); err != nil || n != int64(len(input)) { + t.Errorf("ws[%d],rs[%d]: w.ReadFrom(r) = %d, %v, want %d, nil", wi, ri, n, err, len(input)) + continue + } + if got, want := b.String(), string(input); got != want { + t.Errorf("ws[%d], rs[%d]:\ngot %q\nwant %q\n", wi, ri, got, want) + } + } + } +} + +type errorReaderFromTest struct { + rn, wn int + rerr, werr error + expected error +} + +func (r errorReaderFromTest) Read(p []byte) (int, error) { + return len(p) * r.rn, r.rerr +} + +func (w errorReaderFromTest) Write(p []byte) (int, error) { + return len(p) * w.wn, w.werr +} + +var errorReaderFromTests = []errorReaderFromTest{ + {0, 1, io.EOF, nil, nil}, + {1, 1, io.EOF, nil, nil}, + {0, 1, io.ErrClosedPipe, nil, io.ErrClosedPipe}, + {0, 0, io.ErrClosedPipe, io.ErrShortWrite, io.ErrClosedPipe}, + {1, 0, nil, io.ErrShortWrite, io.ErrShortWrite}, +} + +func TestWriterReadFromErrors(t *testing.T) { + for i, rw := range errorReaderFromTests { + w := NewWriter(rw) + if _, err := w.ReadFrom(rw); err != rw.expected { + t.Errorf("w.ReadFrom(errorReaderFromTests[%d]) = _, %v, want _,%v", i, err, rw.expected) + } + } +} + +// TestWriterReadFromCounts tests that using io.Copy to copy into a +// bufio.Writer does not prematurely flush the buffer. For example, when +// buffering writes to a network socket, excessive network writes should be +// avoided. 
+func TestWriterReadFromCounts(t *testing.T) { + var w0 writeCountingDiscard + b0 := NewWriterSize(&w0, 1234) + b0.WriteString(strings.Repeat("x", 1000)) + if w0 != 0 { + t.Fatalf("write 1000 'x's: got %d writes, want 0", w0) + } + b0.WriteString(strings.Repeat("x", 200)) + if w0 != 0 { + t.Fatalf("write 1200 'x's: got %d writes, want 0", w0) + } + io.Copy(b0, onlyReader{strings.NewReader(strings.Repeat("x", 30))}) + if w0 != 0 { + t.Fatalf("write 1230 'x's: got %d writes, want 0", w0) + } + io.Copy(b0, onlyReader{strings.NewReader(strings.Repeat("x", 9))}) + if w0 != 1 { + t.Fatalf("write 1239 'x's: got %d writes, want 1", w0) + } + + var w1 writeCountingDiscard + b1 := NewWriterSize(&w1, 1234) + b1.WriteString(strings.Repeat("x", 1200)) + b1.Flush() + if w1 != 1 { + t.Fatalf("flush 1200 'x's: got %d writes, want 1", w1) + } + b1.WriteString(strings.Repeat("x", 89)) + if w1 != 1 { + t.Fatalf("write 1200 + 89 'x's: got %d writes, want 1", w1) + } + io.Copy(b1, onlyReader{strings.NewReader(strings.Repeat("x", 700))}) + if w1 != 1 { + t.Fatalf("write 1200 + 789 'x's: got %d writes, want 1", w1) + } + io.Copy(b1, onlyReader{strings.NewReader(strings.Repeat("x", 600))}) + if w1 != 2 { + t.Fatalf("write 1200 + 1389 'x's: got %d writes, want 2", w1) + } + b1.Flush() + if w1 != 3 { + t.Fatalf("flush 1200 + 1389 'x's: got %d writes, want 3", w1) + } +} + +// A writeCountingDiscard is like ioutil.Discard and counts the number of times +// Write is called on it. +type writeCountingDiscard int + +func (w *writeCountingDiscard) Write(p []byte) (int, error) { + *w++ + return len(p), nil +} + +type negativeReader int + +func (r *negativeReader) Read([]byte) (int, error) { return -1, nil } + +func TestNegativeRead(t *testing.T) { + // should panic with a description pointing at the reader, not at itself. + // (should NOT panic with slice index error, for example.) 
+ b := NewReader(new(negativeReader)) + defer func() { + switch err := recover().(type) { + case nil: + t.Fatal("read did not panic") + case error: + if !strings.Contains(err.Error(), "reader returned negative count from Read") { + t.Fatalf("wrong panic: %v", err) + } + default: + t.Fatalf("unexpected panic value: %T(%v)", err, err) + } + }() + b.Read(make([]byte, 100)) +} + +// An onlyReader only implements io.Reader, no matter what other methods the underlying implementation may have. +type onlyReader struct { + r io.Reader +} + +func (r onlyReader) Read(b []byte) (int, error) { + return r.r.Read(b) +} + +// An onlyWriter only implements io.Writer, no matter what other methods the underlying implementation may have. +type onlyWriter struct { + w io.Writer +} + +func (w onlyWriter) Write(b []byte) (int, error) { + return w.w.Write(b) +} + +func BenchmarkReaderCopyOptimal(b *testing.B) { + // Optimal case is where the underlying reader implements io.WriterTo + for i := 0; i < b.N; i++ { + b.StopTimer() + src := NewReader(bytes.NewBuffer(make([]byte, 8192))) + dst := onlyWriter{new(bytes.Buffer)} + b.StartTimer() + io.Copy(dst, src) + } +} + +func BenchmarkReaderCopyUnoptimal(b *testing.B) { + // Unoptimal case is where the underlying reader doesn't implement io.WriterTo + for i := 0; i < b.N; i++ { + b.StopTimer() + src := NewReader(onlyReader{bytes.NewBuffer(make([]byte, 8192))}) + dst := onlyWriter{new(bytes.Buffer)} + b.StartTimer() + io.Copy(dst, src) + } +} + +func BenchmarkReaderCopyNoWriteTo(b *testing.B) { + for i := 0; i < b.N; i++ { + b.StopTimer() + src := onlyReader{NewReader(bytes.NewBuffer(make([]byte, 8192)))} + dst := onlyWriter{new(bytes.Buffer)} + b.StartTimer() + io.Copy(dst, src) + } +} + +func BenchmarkWriterCopyOptimal(b *testing.B) { + // Optimal case is where the underlying writer implements io.ReaderFrom + for i := 0; i < b.N; i++ { + b.StopTimer() + src := onlyReader{bytes.NewBuffer(make([]byte, 8192))} + dst := 
NewWriter(new(bytes.Buffer)) + b.StartTimer() + io.Copy(dst, src) + } +} + +func BenchmarkWriterCopyUnoptimal(b *testing.B) { + for i := 0; i < b.N; i++ { + b.StopTimer() + src := onlyReader{bytes.NewBuffer(make([]byte, 8192))} + dst := NewWriter(onlyWriter{new(bytes.Buffer)}) + b.StartTimer() + io.Copy(dst, src) + } +} + +func BenchmarkWriterCopyNoReadFrom(b *testing.B) { + for i := 0; i < b.N; i++ { + b.StopTimer() + src := onlyReader{bytes.NewBuffer(make([]byte, 8192))} + dst := onlyWriter{NewWriter(new(bytes.Buffer))} + b.StartTimer() + io.Copy(dst, src) + } +} diff --git a/src/pkg/bufio/example_test.go b/src/pkg/bufio/example_test.go new file mode 100644 index 000000000..b545ce39a --- /dev/null +++ b/src/pkg/bufio/example_test.go @@ -0,0 +1,74 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bufio_test + +import ( + "bufio" + "fmt" + "os" + "strconv" + "strings" +) + +// The simplest use of a Scanner, to read standard input as a set of lines. +func ExampleScanner_lines() { + scanner := bufio.NewScanner(os.Stdin) + for scanner.Scan() { + fmt.Println(scanner.Text()) // Println will add back the final '\n' + } + if err := scanner.Err(); err != nil { + fmt.Fprintln(os.Stdout, "reading standard input:", err) + } +} + +// Use a Scanner to implement a simple word-count utility by scanning the +// input as a sequence of space-delimited tokens. +func ExampleScanner_words() { + // An artificial input source. + const input = "Now is the winter of our discontent,\nMade glorious summer by this sun of York.\n" + scanner := bufio.NewScanner(strings.NewReader(input)) + // Set the split function for the scanning operation. + scanner.Split(bufio.ScanWords) + // Count the words. 
+ count := 0 + for scanner.Scan() { + count++ + } + if err := scanner.Err(); err != nil { + fmt.Fprintln(os.Stdout, "reading input:", err) + } + fmt.Printf("%d\n", count) + // Output: 15 +} + +// Use a Scanner with a custom split function (built by wrapping ScanWords) to validate +// 32-bit decimal input. +func ExampleScanner_custom() { + // An artificial input source. + const input = "1234 5678 1234567901234567890" + scanner := bufio.NewScanner(strings.NewReader(input)) + // Create a custom split function by wrapping the existing ScanWords function. + split := func(data []byte, atEOF bool) (advance int, token []byte, err error) { + advance, token, err = bufio.ScanWords(data, atEOF) + if err == nil && token != nil { + _, err = strconv.ParseInt(string(token), 10, 32) + } + return + } + // Set the split function for the scanning operation. + scanner.Split(split) + // Validate the input + for scanner.Scan() { + fmt.Printf("%s\n", scanner.Text()) + } + + if err := scanner.Err(); err != nil { + fmt.Printf("Invalid input: %s", err) + } + // Output: + // 1234 + // 5678 + // Invalid input: strconv.ParseInt: parsing "1234567901234567890": value out of range +} diff --git a/src/pkg/bufio/export_test.go b/src/pkg/bufio/export_test.go new file mode 100644 index 000000000..3d3bb27d8 --- /dev/null +++ b/src/pkg/bufio/export_test.go @@ -0,0 +1,27 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bufio + +// Exported for testing only. +import ( + "unicode/utf8" +) + +var IsSpace = isSpace + +func (s *Scanner) MaxTokenSize(n int) { + if n < utf8.UTFMax || n > 1e9 { + panic("bad max token size") + } + if n < len(s.buf) { + s.buf = make([]byte, n) + } + s.maxTokenSize = n +} + +// ErrOrEOF is like Err, but returns EOF. Used to test a corner case. 
+func (s *Scanner) ErrOrEOF() error { + return s.err +} diff --git a/src/pkg/bufio/scan.go b/src/pkg/bufio/scan.go new file mode 100644 index 000000000..268ce6d1d --- /dev/null +++ b/src/pkg/bufio/scan.go @@ -0,0 +1,338 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bufio + +import ( + "bytes" + "errors" + "io" + "unicode/utf8" +) + +// Scanner provides a convenient interface for reading data such as +// a file of newline-delimited lines of text. Successive calls to +// the Scan method will step through the 'tokens' of a file, skipping +// the bytes between the tokens. The specification of a token is +// defined by a split function of type SplitFunc; the default split +// function breaks the input into lines with newlines stripped. Split +// functions are defined in this package for scanning a file into +// lines, bytes, UTF-8-encoded runes, and space-delimited words. The +// client may instead provide a custom split function. +// +// Scanning stops unrecoverably at EOF, the first I/O error, or a token too +// large to fit in the buffer. When a scan stops, the reader may have +// advanced arbitrarily far past the last token. Programs that need more +// control over error handling or large tokens, or must run sequential scans +// on a reader, should use bufio.Reader instead. +// +// TODO(r): Provide executable examples. +// +type Scanner struct { + r io.Reader // The reader provided by the client. + split SplitFunc // The function to split the tokens. + maxTokenSize int // Maximum size of a token; modified by tests. + token []byte // Last token returned by split. + buf []byte // Buffer used as argument to split. + start int // First non-processed byte in buf. + end int // End of data in buf. + err error // Sticky error. +} + +// SplitFunc is the signature of the split function used to tokenize the +// input. 
The arguments are an initial substring of the remaining unprocessed +// data and a flag, atEOF, that reports whether the Reader has no more data +// to give. The return values are the number of bytes to advance the input +// and the next token to return to the user, plus an error, if any. If the +// data does not yet hold a complete token, for instance if it has no newline +// while scanning lines, SplitFunc can return (0, nil) to signal the Scanner +// to read more data into the slice and try again with a longer slice +// starting at the same point in the input. +// +// If the returned error is non-nil, scanning stops and the error +// is returned to the client. +// +// The function is never called with an empty data slice unless atEOF +// is true. If atEOF is true, however, data may be non-empty and, +// as always, holds unprocessed text. +type SplitFunc func(data []byte, atEOF bool) (advance int, token []byte, err error) + +// Errors returned by Scanner. +var ( + ErrTooLong = errors.New("bufio.Scanner: token too long") + ErrNegativeAdvance = errors.New("bufio.Scanner: SplitFunc returns negative advance count") + ErrAdvanceTooFar = errors.New("bufio.Scanner: SplitFunc returns advance count beyond input") +) + +const ( + // Maximum size used to buffer a token. The actual maximum token size + // may be smaller as the buffer may need to include, for instance, a newline. + MaxScanTokenSize = 64 * 1024 +) + +// NewScanner returns a new Scanner to read from r. +func NewScanner(r io.Reader) *Scanner { + return &Scanner{ + r: r, + split: ScanLines, + maxTokenSize: MaxScanTokenSize, + buf: make([]byte, 4096), // Plausible starting size; needn't be large. + } +} + +// Err returns the first non-EOF error that was encountered by the Scanner. +func (s *Scanner) Err() error { + if s.err == io.EOF { + return nil + } + return s.err +} + +// Bytes returns the most recent token generated by a call to Scan. 
+// The underlying array may point to data that will be overwritten +// by a subsequent call to Scan. It does no allocation. +func (s *Scanner) Bytes() []byte { + return s.token +} + +// Text returns the most recent token generated by a call to Scan +// as a newly allocated string holding its bytes. +func (s *Scanner) Text() string { + return string(s.token) +} + +// Scan advances the Scanner to the next token, which will then be +// available through the Bytes or Text method. It returns false when the +// scan stops, either by reaching the end of the input or an error. +// After Scan returns false, the Err method will return any error that +// occurred during scanning, except that if it was io.EOF, Err +// will return nil. +func (s *Scanner) Scan() bool { + // Loop until we have a token. + for { + // See if we can get a token with what we already have. + if s.end > s.start { + advance, token, err := s.split(s.buf[s.start:s.end], s.err != nil) + if err != nil { + s.setErr(err) + return false + } + if !s.advance(advance) { + return false + } + s.token = token + if token != nil { + return true + } + } + // We cannot generate a token with what we are holding. + // If we've already hit EOF or an I/O error, we are done. + if s.err != nil { + // Shut it down. + s.start = 0 + s.end = 0 + return false + } + // Must read more data. + // First, shift data to beginning of buffer if there's lots of empty space + // or space is needed. + if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) { + copy(s.buf, s.buf[s.start:s.end]) + s.end -= s.start + s.start = 0 + } + // Is the buffer full? If so, resize. + if s.end == len(s.buf) { + if len(s.buf) >= s.maxTokenSize { + s.setErr(ErrTooLong) + return false + } + newSize := len(s.buf) * 2 + if newSize > s.maxTokenSize { + newSize = s.maxTokenSize + } + newBuf := make([]byte, newSize) + copy(newBuf, s.buf[s.start:s.end]) + s.buf = newBuf + s.end -= s.start + s.start = 0 + continue + } + // Finally we can read some input. 
+ n, err := s.r.Read(s.buf[s.end:len(s.buf)]) + if err != nil { + s.setErr(err) + } + if n == 0 { // Don't loop forever if Reader doesn't deliver EOF; setErr keeps a real Read error. + s.setErr(io.EOF) + } + s.end += n + } + panic("not reached") +} + +// advance consumes n bytes of the buffer. It reports whether the advance was legal. +func (s *Scanner) advance(n int) bool { + if n < 0 { + s.setErr(ErrNegativeAdvance) + return false + } + if n > s.end-s.start { + s.setErr(ErrAdvanceTooFar) + return false + } + s.start += n + return true +} + +// setErr records the first error encountered. +func (s *Scanner) setErr(err error) { + if s.err == nil || s.err == io.EOF { + s.err = err + } +} + +// Split sets the split function for the Scanner. If called, it must be +// called before Scan. The default split function is ScanLines. +func (s *Scanner) Split(split SplitFunc) { + s.split = split +} + +// Split functions + +// ScanBytes is a split function for a Scanner that returns each byte as a token. +func ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + return 1, data[0:1], nil +} + +var errorRune = []byte(string(utf8.RuneError)) + +// ScanRunes is a split function for a Scanner that returns each +// UTF-8-encoded rune as a token. The sequence of runes returned is +// equivalent to that from a range loop over the input as a string, which +// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd". +// Because of the Scan interface, this makes it impossible for the client to +// distinguish correctly encoded replacement runes from encoding errors. +func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + // Fast path 1: ASCII. + if data[0] < utf8.RuneSelf { + return 1, data[0:1], nil + } + + // Fast path 2: Correct UTF-8 decode without error. + _, width := utf8.DecodeRune(data) + if width > 1 { + // It's a valid encoding. 
Width cannot be one for a correctly encoded + // non-ASCII rune. + return width, data[0:width], nil + } + + // We know it's an error: we have width==1 and implicitly r==utf8.RuneError. + // Is the error because there wasn't a full rune to be decoded? + // FullRune distinguishes correctly between erroneous and incomplete encodings. + if !atEOF && !utf8.FullRune(data) { + // Incomplete; get more bytes. + return 0, nil, nil + } + + // We have a real UTF-8 encoding error. Return a properly encoded error rune + // but advance only one byte. This matches the behavior of a range loop over + // an incorrectly encoded string. + return 1, errorRune, nil +} + +// dropCR drops a terminal \r from the data. +func dropCR(data []byte) []byte { + if len(data) > 0 && data[len(data)-1] == '\r' { + return data[0 : len(data)-1] + } + return data +} + +// ScanLines is a split function for a Scanner that returns each line of +// text, stripped of any trailing end-of-line marker. The returned line may +// be empty. The end-of-line marker is one optional carriage return followed +// by one mandatory newline. In regular expression notation, it is `\r?\n'. +// The last non-empty line of input will be returned even if it has no +// newline. +func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + if i := bytes.IndexByte(data, '\n'); i >= 0 { + // We have a full newline-terminated line. + return i + 1, dropCR(data[0:i]), nil + } + // If we're at EOF, we have a final, non-terminated line. Return it. + if atEOF { + return len(data), dropCR(data), nil + } + // Request more data. + return 0, nil, nil +} + +// isSpace returns whether the character is a Unicode white space character. +// We avoid dependency on the unicode package, but check validity of the implementation +// in the tests. +func isSpace(r rune) bool { + if r <= '\u00FF' { + // Obvious ASCII ones: \t through \r plus space. Plus two Latin-1 oddballs. 
+ switch r { + case ' ', '\t', '\n', '\v', '\f', '\r': + return true + case '\u0085', '\u00A0': + return true + } + return false + } + // High-valued ones. + if '\u2000' <= r && r <= '\u200a' { + return true + } + switch r { + case '\u1680', '\u180e', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000': + return true + } + return false +} + +// ScanWords is a split function for a Scanner that returns each +// space-separated word of text, with surrounding spaces deleted. It will +// never return an empty string. The definition of space is set by +// unicode.IsSpace. +func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) { + // Skip leading spaces. + start := 0 + for width := 0; start < len(data); start += width { + var r rune + r, width = utf8.DecodeRune(data[start:]) + if !isSpace(r) { + break + } + } + if atEOF && len(data) == 0 { + return 0, nil, nil + } + // Scan until space, marking end of word. + for width, i := 0, start; i < len(data); i += width { + var r rune + r, width = utf8.DecodeRune(data[i:]) + if isSpace(r) { + return i + width, data[start:i], nil + } + } + // If we're at EOF, we have a final, non-empty, non-terminated word. Return it. + if atEOF && len(data) > start { + return len(data), data[start:], nil + } + // Request more data. + return 0, nil, nil +} diff --git a/src/pkg/bufio/scan_test.go b/src/pkg/bufio/scan_test.go new file mode 100644 index 000000000..48729aabb --- /dev/null +++ b/src/pkg/bufio/scan_test.go @@ -0,0 +1,370 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bufio_test + +import ( + . "bufio" + "bytes" + "errors" + "io" + "strings" + "testing" + "unicode" + "unicode/utf8" +) + +// Test white space table matches the Unicode definition. 
+func TestSpace(t *testing.T) { + for r := rune(0); r <= utf8.MaxRune; r++ { + if IsSpace(r) != unicode.IsSpace(r) { + t.Fatalf("white space property disagrees: %#U should be %t", r, unicode.IsSpace(r)) + } + } +} + +var scanTests = []string{ + "", + "a", + "¼", + "☹", + "\x81", // UTF-8 error + "\uFFFD", // correctly encoded RuneError + "abcdefgh", + "abc def\n\t\tgh ", + "abc¼☹\x81\uFFFD日本語\x82abc", +} + +func TestScanByte(t *testing.T) { + for n, test := range scanTests { + buf := bytes.NewBufferString(test) + s := NewScanner(buf) + s.Split(ScanBytes) + var i int + for i = 0; s.Scan(); i++ { + if b := s.Bytes(); len(b) != 1 || b[0] != test[i] { + t.Errorf("#%d: %d: expected %q got %q", n, i, test, b) + } + } + if i != len(test) { + t.Errorf("#%d: termination expected at %d; got %d", n, len(test), i) + } + err := s.Err() + if err != nil { + t.Errorf("#%d: %v", n, err) + } + } +} + +// Test that the rune splitter returns same sequence of runes (not bytes) as for range string. +func TestScanRune(t *testing.T) { + for n, test := range scanTests { + buf := bytes.NewBufferString(test) + s := NewScanner(buf) + s.Split(ScanRunes) + var i, runeCount int + var expect rune + // Use a string range loop to validate the sequence of runes. 
+ for i, expect = range string(test) { + if !s.Scan() { + break + } + runeCount++ + got, _ := utf8.DecodeRune(s.Bytes()) + if got != expect { + t.Errorf("#%d: %d: expected %q got %q", n, i, expect, got) + } + } + if s.Scan() { + t.Errorf("#%d: scan ran too long, got %q", n, s.Text()) + } + testRuneCount := utf8.RuneCountInString(test) + if runeCount != testRuneCount { + t.Errorf("#%d: termination expected at %d; got %d", n, testRuneCount, runeCount) + } + err := s.Err() + if err != nil { + t.Errorf("#%d: %v", n, err) + } + } +} + +var wordScanTests = []string{ + "", + " ", + "\n", + "a", + " a ", + "abc def", + " abc def ", + " abc\tdef\nghi\rjkl\fmno\vpqr\u0085stu\u00a0\n", +} + +// Test that the word splitter returns the same data as strings.Fields. +func TestScanWords(t *testing.T) { + for n, test := range wordScanTests { + buf := bytes.NewBufferString(test) + s := NewScanner(buf) + s.Split(ScanWords) + words := strings.Fields(test) + var wordCount int + for wordCount = 0; wordCount < len(words); wordCount++ { + if !s.Scan() { + break + } + got := s.Text() + if got != words[wordCount] { + t.Errorf("#%d: %d: expected %q got %q", n, wordCount, words[wordCount], got) + } + } + if s.Scan() { + t.Errorf("#%d: scan ran too long, got %q", n, s.Text()) + } + if wordCount != len(words) { + t.Errorf("#%d: termination expected at %d; got %d", n, len(words), wordCount) + } + err := s.Err() + if err != nil { + t.Errorf("#%d: %v", n, err) + } + } +} + +// slowReader is a reader that returns only a few bytes at a time, to test the incremental +// reads in Scanner.Scan. +type slowReader struct { + max int + buf *bytes.Buffer +} + +func (sr *slowReader) Read(p []byte) (n int, err error) { + if len(p) > sr.max { + p = p[0:sr.max] + } + return sr.buf.Read(p) +} + +// genLine writes to buf a predictable but non-trivial line of text of length +// n, including the terminal newline and an occasional carriage return. +// If addNewline is false, the \r and \n are not emitted. 
+func genLine(buf *bytes.Buffer, lineNum, n int, addNewline bool) { + buf.Reset() + doCR := lineNum%5 == 0 + if doCR { + n-- + } + for i := 0; i < n-1; i++ { // Stop early for \n. + c := 'a' + byte(lineNum+i) + if c == '\n' || c == '\r' { // Don't confuse us. + c = 'N' + } + buf.WriteByte(c) + } + if addNewline { + if doCR { + buf.WriteByte('\r') + } + buf.WriteByte('\n') + } + return +} + +// Test the line splitter, including some carriage returns but no long lines. +func TestScanLongLines(t *testing.T) { + const smallMaxTokenSize = 256 // Much smaller for more efficient testing. + // Build a buffer of lots of line lengths up to but not exceeding smallMaxTokenSize. + tmp := new(bytes.Buffer) + buf := new(bytes.Buffer) + lineNum := 0 + j := 0 + for i := 0; i < 2*smallMaxTokenSize; i++ { + genLine(tmp, lineNum, j, true) + if j < smallMaxTokenSize { + j++ + } else { + j-- + } + buf.Write(tmp.Bytes()) + lineNum++ + } + s := NewScanner(&slowReader{1, buf}) + s.Split(ScanLines) + s.MaxTokenSize(smallMaxTokenSize) + j = 0 + for lineNum := 0; s.Scan(); lineNum++ { + genLine(tmp, lineNum, j, false) + if j < smallMaxTokenSize { + j++ + } else { + j-- + } + line := tmp.String() // We use the string-valued token here, for variety. + if s.Text() != line { + t.Errorf("%d: bad line: %d %d\n%.100q\n%.100q\n", lineNum, len(s.Bytes()), len(line), s.Text(), line) + } + } + err := s.Err() + if err != nil { + t.Fatal(err) + } +} + +// Test that the line splitter errors out on a long line. +func TestScanLineTooLong(t *testing.T) { + const smallMaxTokenSize = 256 // Much smaller for more efficient testing. + // Build a buffer of lots of line lengths up to but not exceeding smallMaxTokenSize. 
+ tmp := new(bytes.Buffer) + buf := new(bytes.Buffer) + lineNum := 0 + j := 0 + for i := 0; i < 2*smallMaxTokenSize; i++ { + genLine(tmp, lineNum, j, true) + j++ + buf.Write(tmp.Bytes()) + lineNum++ + } + s := NewScanner(&slowReader{3, buf}) + s.Split(ScanLines) + s.MaxTokenSize(smallMaxTokenSize) + j = 0 + for lineNum := 0; s.Scan(); lineNum++ { + genLine(tmp, lineNum, j, false) + if j < smallMaxTokenSize { + j++ + } else { + j-- + } + line := tmp.Bytes() + if !bytes.Equal(s.Bytes(), line) { + t.Errorf("%d: bad line: %d %d\n%.100q\n%.100q\n", lineNum, len(s.Bytes()), len(line), s.Bytes(), line) + } + } + err := s.Err() + if err != ErrTooLong { + t.Fatalf("expected ErrTooLong; got %s", err) + } +} + +// Test that the line splitter handles a final line without a newline. +func testNoNewline(text string, lines []string, t *testing.T) { + buf := bytes.NewBufferString(text) + s := NewScanner(&slowReader{7, buf}) + s.Split(ScanLines) + for lineNum := 0; s.Scan(); lineNum++ { + line := lines[lineNum] + if s.Text() != line { + t.Errorf("%d: bad line: %d %d\n%.100q\n%.100q\n", lineNum, len(s.Bytes()), len(line), s.Bytes(), line) + } + } + err := s.Err() + if err != nil { + t.Fatal(err) + } +} + +var noNewlineLines = []string{ + "abcdefghijklmn\nopqrstuvwxyz", +} + +// Test that the line splitter handles a final line without a newline. +func TestScanLineNoNewline(t *testing.T) { + const text = "abcdefghijklmn\nopqrstuvwxyz" + lines := []string{ + "abcdefghijklmn", + "opqrstuvwxyz", + } + testNoNewline(text, lines, t) +} + +// Test that the line splitter handles a final line with a carriage return but nonewline. +func TestScanLineReturnButNoNewline(t *testing.T) { + const text = "abcdefghijklmn\nopqrstuvwxyz\r" + lines := []string{ + "abcdefghijklmn", + "opqrstuvwxyz", + } + testNoNewline(text, lines, t) +} + +// Test that the line splitter handles a final empty line. 
+func TestScanLineEmptyFinalLine(t *testing.T) { + const text = "abcdefghijklmn\nopqrstuvwxyz\n\n" + lines := []string{ + "abcdefghijklmn", + "opqrstuvwxyz", + "", + } + testNoNewline(text, lines, t) +} + +// Test that the line splitter handles a final empty line with a carriage return but no newline. +func TestScanLineEmptyFinalLineWithCR(t *testing.T) { + const text = "abcdefghijklmn\nopqrstuvwxyz\n\r" + lines := []string{ + "abcdefghijklmn", + "opqrstuvwxyz", + "", + } + testNoNewline(text, lines, t) +} + +var testError = errors.New("testError") + +// Test the correct error is returned when the split function errors out. +func TestSplitError(t *testing.T) { + // Create a split function that delivers a little data, then a predictable error. + numSplits := 0 + const okCount = 7 + errorSplit := func(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF { + panic("didn't get enough data") + } + if numSplits >= okCount { + return 0, nil, testError + } + numSplits++ + return 1, data[0:1], nil + } + // Read the data. + const text = "abcdefghijklmnopqrstuvwxyz" + buf := bytes.NewBufferString(text) + s := NewScanner(&slowReader{1, buf}) + s.Split(errorSplit) + var i int + for i = 0; s.Scan(); i++ { + if len(s.Bytes()) != 1 || text[i] != s.Bytes()[0] { + t.Errorf("#%d: expected %q got %q", i, text[i], s.Bytes()[0]) + } + } + // Check correct termination location and error. + if i != okCount { + t.Errorf("unexpected termination; expected %d tokens got %d", okCount, i) + } + err := s.Err() + if err != testError { + t.Fatalf("expected %q got %v", testError, err) + } +} + +// Test that an EOF is overridden by a user-generated scan error. +func TestErrAtEOF(t *testing.T) { + s := NewScanner(strings.NewReader("1 2 33")) + // This spitter will fail on last entry, after s.err==EOF. 
+ split := func(data []byte, atEOF bool) (advance int, token []byte, err error) { + advance, token, err = ScanWords(data, atEOF) + if len(token) > 1 { + if s.ErrOrEOF() != io.EOF { + t.Fatal("not testing EOF") + } + err = testError + } + return + } + s.Split(split) + for s.Scan() { + } + if s.Err() != testError { + t.Fatal("wrong error:", s.Err()) + } +} |