summaryrefslogtreecommitdiff
path: root/src/encoding/csv/reader.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/encoding/csv/reader.go')
-rw-r--r--src/encoding/csv/reader.go337
1 files changed, 337 insertions, 0 deletions
diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go
new file mode 100644
index 000000000..d9432954a
--- /dev/null
+++ b/src/encoding/csv/reader.go
@@ -0,0 +1,337 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package csv reads and writes comma-separated values (CSV) files.
+//
+// A csv file contains zero or more records of one or more fields per record.
+// Each record is separated by the newline character. The final record may
+// optionally be followed by a newline character.
+//
+// field1,field2,field3
+//
+// White space is considered part of a field.
+//
+// Carriage returns before newline characters are silently removed.
+//
+// Blank lines are ignored. A line with only whitespace characters (excluding
+// the ending newline character) is not considered a blank line.
+//
+// Fields which start and stop with the quote character " are called
+// quoted-fields. The beginning and ending quote are not part of the
+// field.
+//
+// The source:
+//
+// normal string,"quoted-field"
+//
+// results in the fields
+//
+// {`normal string`, `quoted-field`}
+//
+// Within a quoted-field a quote character followed by a second quote
+// character is considered a single quote.
+//
+// "the ""word"" is true","a ""quoted-field"""
+//
+// results in
+//
+// {`the "word" is true`, `a "quoted-field"`}
+//
+// Newlines and commas may be included in a quoted-field
+//
+// "Multi-line
+// field","comma is ,"
+//
+// results in
+//
+// {`Multi-line
+// field`, `comma is ,`}
+package csv
+
+import (
+ "bufio"
+ "bytes"
+ "errors"
+ "fmt"
+ "io"
+ "unicode"
+)
+
+// A ParseError is returned for parsing errors.
+// The first line is 1. The first column is 0.
+type ParseError struct {
+ Line int // Line where the error occurred
+ Column int // Column (rune index) where the error occurred
+ Err error // The actual error
+}
+
+func (e *ParseError) Error() string {
+ return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
+}
+
+// These are the errors that can be returned in ParseError.Error
+var (
+ ErrTrailingComma = errors.New("extra delimiter at end of line") // no longer used
+ ErrBareQuote = errors.New("bare \" in non-quoted-field")
+ ErrQuote = errors.New("extraneous \" in field")
+ ErrFieldCount = errors.New("wrong number of fields in line")
+)
+
+// A Reader reads records from a CSV-encoded file.
+//
+// As returned by NewReader, a Reader expects input conforming to RFC 4180.
+// The exported fields can be changed to customize the details before the
+// first call to Read or ReadAll.
+//
+// Comma is the field delimiter. It defaults to ','.
+//
+// Comment, if not 0, is the comment character. Lines beginning with the
+// Comment character are ignored.
+//
+// If FieldsPerRecord is positive, Read requires each record to
+// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
+// the number of fields in the first record, so that future records must
+// have the same field count. If FieldsPerRecord is negative, no check is
+// made and records may have a variable number of fields.
+//
+// If LazyQuotes is true, a quote may appear in an unquoted field and a
+// non-doubled quote may appear in a quoted field.
+//
+// If TrimLeadingSpace is true, leading white space in a field is ignored.
+type Reader struct {
+ Comma rune // field delimiter (set to ',' by NewReader)
+ Comment rune // comment character for start of line
+ FieldsPerRecord int // number of expected fields per record
+ LazyQuotes bool // allow lazy quotes
+ TrailingComma bool // ignored; here for backwards compatibility
+ TrimLeadingSpace bool // trim leading space
+ line int
+ column int
+ r *bufio.Reader
+ field bytes.Buffer
+}
+
+// NewReader returns a new Reader that reads from r.
+func NewReader(r io.Reader) *Reader {
+ return &Reader{
+ Comma: ',',
+ r: bufio.NewReader(r),
+ }
+}
+
+// error creates a new ParseError based on err.
+func (r *Reader) error(err error) error {
+ return &ParseError{
+ Line: r.line,
+ Column: r.column,
+ Err: err,
+ }
+}
+
+// Read reads one record from r. The record is a slice of strings with each
+// string representing one field.
+func (r *Reader) Read() (record []string, err error) {
+ for {
+ record, err = r.parseRecord()
+ if record != nil {
+ break
+ }
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ if r.FieldsPerRecord > 0 {
+ if len(record) != r.FieldsPerRecord {
+ r.column = 0 // report at start of record
+ return record, r.error(ErrFieldCount)
+ }
+ } else if r.FieldsPerRecord == 0 {
+ r.FieldsPerRecord = len(record)
+ }
+ return record, nil
+}
+
+// ReadAll reads all the remaining records from r.
+// Each record is a slice of fields.
+// A successful call returns err == nil, not err == EOF. Because ReadAll is
+// defined to read until EOF, it does not treat end of file as an error to be
+// reported.
+func (r *Reader) ReadAll() (records [][]string, err error) {
+ for {
+ record, err := r.Read()
+ if err == io.EOF {
+ return records, nil
+ }
+ if err != nil {
+ return nil, err
+ }
+ records = append(records, record)
+ }
+}
+
+// readRune reads one rune from r, folding \r\n to \n and keeping track
+// of how far into the line we have read. r.column will point to the start
+// of this rune, not the end of this rune.
+func (r *Reader) readRune() (rune, error) {
+ r1, _, err := r.r.ReadRune()
+
+ // Handle \r\n here. We make the simplifying assumption that
+ // anytime \r is followed by \n that it can be folded to \n.
+ // We will not detect files which contain both \r\n and bare \n.
+ if r1 == '\r' {
+ r1, _, err = r.r.ReadRune()
+ if err == nil {
+ if r1 != '\n' {
+ r.r.UnreadRune()
+ r1 = '\r'
+ }
+ }
+ }
+ r.column++
+ return r1, err
+}
+
+// skip reads runes up to and including the rune delim or until error.
+func (r *Reader) skip(delim rune) error {
+ for {
+ r1, err := r.readRune()
+ if err != nil {
+ return err
+ }
+ if r1 == delim {
+ return nil
+ }
+ }
+}
+
+// parseRecord reads and parses a single csv record from r.
+func (r *Reader) parseRecord() (fields []string, err error) {
+ // Each record starts on a new line. We increment our line
+ // number (lines start at 1, not 0) and set column to -1
+ // so as we increment in readRune it points to the character we read.
+ r.line++
+ r.column = -1
+
+ // Peek at the first rune. If it is an error we are done.
+ // If we are support comments and it is the comment character
+ // then skip to the end of line.
+
+ r1, _, err := r.r.ReadRune()
+ if err != nil {
+ return nil, err
+ }
+
+ if r.Comment != 0 && r1 == r.Comment {
+ return nil, r.skip('\n')
+ }
+ r.r.UnreadRune()
+
+ // At this point we have at least one field.
+ for {
+ haveField, delim, err := r.parseField()
+ if haveField {
+ fields = append(fields, r.field.String())
+ }
+ if delim == '\n' || err == io.EOF {
+ return fields, err
+ } else if err != nil {
+ return nil, err
+ }
+ }
+}
+
+// parseField parses the next field in the record. The read field is
+// located in r.field. Delim is the first character not part of the field
+// (r.Comma or '\n').
+func (r *Reader) parseField() (haveField bool, delim rune, err error) {
+ r.field.Reset()
+
+ r1, err := r.readRune()
+ for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
+ r1, err = r.readRune()
+ }
+
+ if err == io.EOF && r.column != 0 {
+ return true, 0, err
+ }
+ if err != nil {
+ return false, 0, err
+ }
+
+ switch r1 {
+ case r.Comma:
+ // will check below
+
+ case '\n':
+ // We are a trailing empty field or a blank line
+ if r.column == 0 {
+ return false, r1, nil
+ }
+ return true, r1, nil
+
+ case '"':
+ // quoted field
+ Quoted:
+ for {
+ r1, err = r.readRune()
+ if err != nil {
+ if err == io.EOF {
+ if r.LazyQuotes {
+ return true, 0, err
+ }
+ return false, 0, r.error(ErrQuote)
+ }
+ return false, 0, err
+ }
+ switch r1 {
+ case '"':
+ r1, err = r.readRune()
+ if err != nil || r1 == r.Comma {
+ break Quoted
+ }
+ if r1 == '\n' {
+ return true, r1, nil
+ }
+ if r1 != '"' {
+ if !r.LazyQuotes {
+ r.column--
+ return false, 0, r.error(ErrQuote)
+ }
+ // accept the bare quote
+ r.field.WriteRune('"')
+ }
+ case '\n':
+ r.line++
+ r.column = -1
+ }
+ r.field.WriteRune(r1)
+ }
+
+ default:
+ // unquoted field
+ for {
+ r.field.WriteRune(r1)
+ r1, err = r.readRune()
+ if err != nil || r1 == r.Comma {
+ break
+ }
+ if r1 == '\n' {
+ return true, r1, nil
+ }
+ if !r.LazyQuotes && r1 == '"' {
+ return false, 0, r.error(ErrBareQuote)
+ }
+ }
+ }
+
+ if err != nil {
+ if err == io.EOF {
+ return true, 0, err
+ }
+ return false, 0, err
+ }
+
+ return true, r1, nil
+}