Imported Upstream version 59upstream/59

author: Ondřej Surý <ondrej@sury.org> 2011-08-03 16:54:30 +0200
committer: Ondřej Surý <ondrej@sury.org> 2011-08-03 16:54:30 +0200
commit: 28592ee1ea1f5cdffcf85472f9de0285d928cf12 (patch)
tree: 32944e18b23f7fe4a0818a694aa2a6dfb1835463 /src/pkg/csv/reader.go
parent: e836bee4716dc0d4d913537ad3ad1925a7ac32d0 (diff)
download: golang-upstream/59.tar.gz
1 files changed, 373 insertions, 0 deletions
diff --git a/src/pkg/csv/reader.go b/src/pkg/csv/reader.go
new file mode 100644
index 000000000..1f4b61cf9
--- /dev/null
+++ b/src/pkg/csv/reader.go
@@ -0,0 +1,373 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package csv reads and writes comma-separated values (CSV) files.
+//
+// A csv file contains zero or more records of one or more fields per record.
+// Each record is separated by the newline character. The final record may
+// optionally be followed by a newline character.
+//
+//	field1,field2,field3
+//
+// White space is considered part of a field.
+//
+// Carriage returns before newline characters are silently removed.
+//
+// Blank lines are ignored.  A line with only whitespace characters (excluding
+// the ending newline character) is not considered a blank line.
+//
+// Fields which start and stop with the quote character " are called
+// quoted-fields.  The beginning and ending quote are not part of the
+// field.
+//
+// The source:
+//
+//	normal string,"quoted-field"
+//
+// results in the fields
+//
+//	{`normal string`, `quoted-field`}
+//
+// Within a quoted-field a quote character followed by a second quote
+// character is considered a single quote.
+//
+//	"the ""word"" is true","a ""quoted-field"""
+//
+// results in
+//
+//	{`the "word" is true`, `a "quoted-field"`}
+//
+// Newlines and commas may be included in a quoted-field
+//
+//	"Multi-line
+//	field","comma is ,"
+//
+// results in
+//
+//	{`Multi-line
+//	field`, `comma is ,`}
+package csv
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"os"
+	"unicode"
+)
+
+// A ParseError is returned for parsing errors.
+// The first line is 1.  The first column is 0.
+type ParseError struct {
+	Line   int      // Line where the error occurred
+	Column int      // Column (rune index) where the error occurred
+	Error  os.Error // The actual error
+}
+
+func (e *ParseError) String() string {
+	return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Error)
+}
+
+// These are the errors that can be returned in ParseError.Error
+var (
+	ErrTrailingComma = os.NewError("extra delimiter at end of line")
+	ErrBareQuote     = os.NewError("bare \" in non-quoted-field")
+	ErrQuote         = os.NewError("extraneous \" in field")
+	ErrFieldCount    = os.NewError("wrong number of fields in line")
+)
+
+// A Reader reads records from a CSV-encoded file.
+//
+// As returned by NewReader, a Reader expects input conforming to RFC 4180.
+// The exported fields can be changed to customize the details before the
+// first call to Read or ReadAll.
+//
+// Comma is the field delimiter.  It defaults to ','.
+//
+// Comment, if not 0, is the comment character. Lines beginning with the
+// Comment character is ignored.
+//
+// If FieldsPerRecord is positive, Read requires each record to
+// have the given number of fields.  If FieldsPerRecord is 0, Read sets it to
+// the number of fields in the first record, so that future records must
+// have the same field count.
+//
+// If LazyQuotes is true, a quote may appear in an unquoted field and a
+// non-doubled quote may appear in a quoted field.
+//
+// If TrailingComma is true, the last field may be a unquoted empty field.
+//
+// If TrimLeadingSpace is true, leading white space in a field is ignored.
+type Reader struct {
+	Comma            int  // Field delimiter (set to ',' by NewReader)
+	Comment          int  // Comment character for start of line
+	FieldsPerRecord  int  // Number of expected fields per record
+	LazyQuotes       bool // Allow lazy quotes
+	TrailingComma    bool // Allow trailing comma
+	TrimLeadingSpace bool // Trim leading space
+	line             int
+	column           int
+	r                *bufio.Reader
+	field            bytes.Buffer
+}
+
+// NewReader returns a new Reader that reads from r.
+func NewReader(r io.Reader) *Reader {
+	return &Reader{
+		Comma: ',',
+		r:     bufio.NewReader(r),
+	}
+}
+
+// error creates a new ParseError based on err.
+func (r *Reader) error(err os.Error) os.Error {
+	return &ParseError{
+		Line:   r.line,
+		Column: r.column,
+		Error:  err,
+	}
+}
+
+// Read reads one record from r.  The record is a slice of strings with each
+// string representing one field.
+func (r *Reader) Read() (record []string, err os.Error) {
+	for {
+		record, err = r.parseRecord()
+		if record != nil {
+			break
+		}
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if r.FieldsPerRecord > 0 {
+		if len(record) != r.FieldsPerRecord {
+			r.column = 0 // report at start of record
+			return record, r.error(ErrFieldCount)
+		}
+	} else if r.FieldsPerRecord == 0 {
+		r.FieldsPerRecord = len(record)
+	}
+	return record, nil
+}
+
+// ReadAll reads all the remaining records from r.
+// Each record is a slice of fields.
+func (r *Reader) ReadAll() (records [][]string, err os.Error) {
+	for {
+		record, err := r.Read()
+		if err == os.EOF {
+			return records, nil
+		}
+		if err != nil {
+			return nil, err
+		}
+		records = append(records, record)
+	}
+	panic("unreachable")
+}
+
+// readRune reads one rune from r, folding \r\n to \n and keeping track
+// of our far into the line we have read.  r.column will point to the start
+// of this rune, not the end of this rune.
+func (r *Reader) readRune() (int, os.Error) {
+	rune, _, err := r.r.ReadRune()
+
+	// Handle \r\n here.  We make the simplifying assumption that
+	// anytime \r is followed by \n that it can be folded to \n.
+	// We will not detect files which contain both \r\n and bare \n.
+	if rune == '\r' {
+		rune, _, err = r.r.ReadRune()
+		if err == nil {
+			if rune != '\n' {
+				r.r.UnreadRune()
+				rune = '\r'
+			}
+		}
+	}
+	r.column++
+	return rune, err
+}
+
+// unreadRune puts the last rune read from r back.
+func (r *Reader) unreadRune() {
+	r.r.UnreadRune()
+	r.column--
+}
+
+// skip reads runes up to and including the rune delim or until error.
+func (r *Reader) skip(delim int) os.Error {
+	for {
+		rune, err := r.readRune()
+		if err != nil {
+			return err
+		}
+		if rune == delim {
+			return nil
+		}
+	}
+	panic("unreachable")
+}
+
+// parseRecord reads and parses a single csv record from r.
+func (r *Reader) parseRecord() (fields []string, err os.Error) {
+	// Each record starts on a new line.  We increment our line
+	// number (lines start at 1, not 0) and set column to -1
+	// so as we increment in readRune it points to the character we read.
+	r.line++
+	r.column = -1
+
+	// Peek at the first rune.  If it is an error we are done.
+	// If we are support comments and it is the comment character
+	// the skip to the end of line.
+
+	rune, _, err := r.r.ReadRune()
+	if err != nil {
+		return nil, err
+	}
+
+	if r.Comment != 0 && rune == r.Comment {
+		return nil, r.skip('\n')
+	}
+	r.r.UnreadRune()
+
+	// At this point we have at least one field.
+	for {
+		haveField, delim, err := r.parseField()
+		if haveField {
+			fields = append(fields, r.field.String())
+		}
+		if delim == '\n' || err == os.EOF {
+			return fields, err
+		} else if err != nil {
+			return nil, err
+		}
+	}
+	panic("unreachable")
+}
+
+
+// parseField parses the next field in the record.  The read field is
+// located in r.field.  Delim is the first character not part of the field
+// (r.Comma or '\n').
+func (r *Reader) parseField() (haveField bool, delim int, err os.Error) {
+	r.field.Reset()
+
+	rune, err := r.readRune()
+	if err != nil {
+		// If we have EOF and are not at the start of a line
+		// then we return the empty field.  We have already
+		// checked for trailing commas if needed.
+		if err == os.EOF && r.column != 0 {
+			return true, 0, err
+		}
+		return false, 0, err
+	}
+
+	if r.TrimLeadingSpace {
+		for unicode.IsSpace(rune) {
+			rune, err = r.readRune()
+			if err != nil {
+				return false, 0, err
+			}
+		}
+	}
+
+	switch rune {
+	case r.Comma:
+		// will check below
+
+	case '\n':
+		// We are a trailing empty field or a blank linke
+		if r.column == 0 {
+			return false, rune, nil
+		}
+		return true, rune, nil
+
+	case '"':
+		// quoted field
+	Quoted:
+		for {
+			rune, err = r.readRune()
+			if err != nil {
+				if err == os.EOF {
+					if r.LazyQuotes {
+						return true, 0, err
+					}
+					return false, 0, r.error(ErrQuote)
+				}
+				return false, 0, err
+			}
+			switch rune {
+			case '"':
+				rune, err = r.readRune()
+				if err != nil || rune == r.Comma {
+					break Quoted
+				}
+				if rune == '\n' {
+					return true, rune, nil
+				}
+				if rune != '"' {
+					if !r.LazyQuotes {
+						r.column--
+						return false, 0, r.error(ErrQuote)
+					}
+					// accept the bare quote
+					r.field.WriteRune('"')
+				}
+			case '\n':
+				r.line++
+				r.column = -1
+			}
+			r.field.WriteRune(rune)
+		}
+
+	default:
+		// unquoted field
+		for {
+			r.field.WriteRune(rune)
+			rune, err = r.readRune()
+			if err != nil || rune == r.Comma {
+				break
+			}
+			if rune == '\n' {
+				return true, rune, nil
+			}
+			if !r.LazyQuotes && rune == '"' {
+				return false, 0, r.error(ErrBareQuote)
+			}
+		}
+	}
+
+	if err != nil {
+		if err == os.EOF {
+			return true, 0, err
+		}
+		return false, 0, err
+	}
+
+	if !r.TrailingComma {
+		// We don't allow trailing commas.  See if we
+		// are at the end of the line (being mindful
+		// of triming spaces
+		c := r.column
+		rune, err = r.readRune()
+		if r.TrimLeadingSpace {
+			for unicode.IsSpace(rune) {
+				rune, err = r.readRune()
+				if err != nil {
+					break
+				}
+			}
+		}
+		if err == os.EOF || rune == '\n' {
+			r.column = c // report the comma
+			return false, 0, r.error(ErrTrailingComma)
+		}
+		r.unreadRune()
+	}
+	return true, rune, nil
+}
author	Ondřej Surý <ondrej@sury.org>	2011-08-03 16:54:30 +0200
committer	Ondřej Surý <ondrej@sury.org>	2011-08-03 16:54:30 +0200
commit	28592ee1ea1f5cdffcf85472f9de0285d928cf12 (patch)
tree	32944e18b23f7fe4a0818a694aa2a6dfb1835463 /src/pkg/csv/reader.go
parent	e836bee4716dc0d4d913537ad3ad1925a7ac32d0 (diff)
download	golang-upstream/59.tar.gz