...
Run Format

Source file src/encoding/csv/reader.go

Documentation: encoding/csv

  // Copyright 2011 The Go Authors. All rights reserved.
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
  // Package csv reads and writes comma-separated values (CSV) files.
  // There are many kinds of CSV files; this package supports the format
  // described in RFC 4180.
  //
  // A CSV file contains zero or more records of one or more fields per record.
  // Each record is separated by the newline character. The final record may
  // optionally be followed by a newline character.
  //
  //	field1,field2,field3
  //
  // White space is considered part of a field.
  //
  // Carriage returns before newline characters are silently removed.
  //
  // Blank lines are ignored. A line with only whitespace characters (excluding
  // the ending newline character) is not considered a blank line.
  //
  // Fields which start and stop with the quote character " are called
  // quoted-fields. The beginning and ending quote are not part of the
  // field.
  //
  // The source:
  //
  //	normal string,"quoted-field"
  //
  // results in the fields
  //
  //	{`normal string`, `quoted-field`}
  //
  // Within a quoted-field a quote character followed by a second quote
  // character is considered a single quote.
  //
  //	"the ""word"" is true","a ""quoted-field"""
  //
  // results in
  //
  //	{`the "word" is true`, `a "quoted-field"`}
  //
  // Newlines and commas may be included in a quoted-field:
  //
  //	"Multi-line
  //	field","comma is ,"
  //
  // results in
  //
  //	{`Multi-line
  //	field`, `comma is ,`}
  package csv
  
  import (
  	"bufio"
  	"bytes"
  	"errors"
  	"fmt"
  	"io"
  	"unicode"
  	"unicode/utf8"
  )
  
  // A ParseError is returned for parsing errors. It records where in the
  // input the failing record began, where the failure was detected, and the
  // underlying cause.
  //
  // Line numbers are 1-indexed and columns are 0-indexed.
  // For an ErrFieldCount failure, Line is set equal to StartLine and Column
  // is left at zero.
  type ParseError struct {
  	StartLine int   // Line where the record starts
  	Line      int   // Line where the error occurred
  	Column    int   // Column (rune index) where the error occurred
  	Err       error // The actual error
  }
  
  func (e *ParseError) Error() string {
  	if e.Err == ErrFieldCount {
  		return fmt.Sprintf("record on line %d: %v", e.Line, e.Err)
  	}
  	if e.StartLine != e.Line {
  		return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.StartLine, e.Line, e.Column, e.Err)
  	}
  	return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err)
  }
  
  // These are the errors that can be returned in ParseError.Err.
  //
  // ErrBareQuote and ErrQuote are only produced while LazyQuotes is false:
  // ErrBareQuote for a quote found inside an unquoted field, ErrQuote for a
  // stray quote inside a quoted field or a quoted field left open at the end
  // of the input. ErrFieldCount is produced when FieldsPerRecord is positive
  // and a record has a different number of fields.
  var (
  	ErrTrailingComma = errors.New("extra delimiter at end of line") // Deprecated: No longer used.
  	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
  	ErrQuote         = errors.New("extraneous or missing \" in quoted-field")
  	ErrFieldCount    = errors.New("wrong number of fields")
  )
  
  // errInvalidDelim is returned by readRecord, before any input is consumed,
  // when Comma equals Comment or when either one fails validDelim.
  var errInvalidDelim = errors.New("csv: invalid field or comment delimiter")
  
  func validDelim(r rune) bool {
  	return r != 0 && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
  }
  
  // A Reader reads records from a CSV-encoded file.
  //
  // As returned by NewReader, a Reader expects input conforming to RFC 4180.
  // The exported fields can be changed to customize the details before the
  // first call to Read or ReadAll.
  //
  // The Reader converts all \r\n sequences in its input to plain \n,
  // including in multiline field values, so that the returned data does
  // not depend on which line-ending convention an input file uses.
  type Reader struct {
  	// Comma is the field delimiter.
  	// It is set to comma (',') by NewReader.
  	// Comma must be a valid rune and must not be \r, \n,
  	// or the Unicode replacement character (0xFFFD).
  	Comma rune

  	// Comment, if not 0, is the comment character. Lines beginning with the
  	// Comment character without preceding whitespace are ignored.
  	// With leading whitespace the Comment character becomes part of the
  	// field, even if TrimLeadingSpace is true.
  	// Comment must be a valid rune and must not be \r, \n,
  	// or the Unicode replacement character (0xFFFD).
  	// It must also not be equal to Comma.
  	Comment rune

  	// FieldsPerRecord is the number of expected fields per record.
  	// If FieldsPerRecord is positive, Read requires each record to
  	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
  	// the number of fields in the first record, so that future records must
  	// have the same field count. If FieldsPerRecord is negative, no check is
  	// made and records may have a variable number of fields.
  	// A record that fails the check is still returned, together with a
  	// ParseError wrapping ErrFieldCount.
  	FieldsPerRecord int

  	// If LazyQuotes is true, a quote may appear in an unquoted field and a
  	// non-doubled quote may appear in a quoted field.
  	LazyQuotes bool

  	// If TrimLeadingSpace is true, leading white space in a field is ignored.
  	// This is done even if the field delimiter, Comma, is white space.
  	TrimLeadingSpace bool

  	// ReuseRecord controls whether calls to Read may return a slice sharing
  	// the backing array of the previous call's returned slice for performance.
  	// By default, each call to Read returns newly allocated memory owned by the caller.
  	// ReadAll never reuses the slice, regardless of this setting.
  	ReuseRecord bool

  	TrailingComma bool // Deprecated: No longer used.

  	// r is the buffered view of the input that readLine pulls lines from.
  	// NewReader creates it around the caller's io.Reader.
  	r *bufio.Reader

  	// numLine is the current line being read in the CSV file.
  	// readLine increments it once per call, so the first line is line 1.
  	numLine int

  	// rawBuffer is a line buffer only used by the readLine method.
  	// It collects lines that do not fit in r's internal buffer.
  	rawBuffer []byte

  	// recordBuffer holds the unescaped fields, one after another.
  	// The fields can be accessed by using the indexes in fieldIndexes.
  	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
  	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
  	recordBuffer []byte

  	// fieldIndexes is an index of fields inside recordBuffer.
  	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
  	fieldIndexes []int

  	// lastRecord is a record cache and only used when ReuseRecord == true.
  	// Read stores the slice it returned here and hands it back to
  	// readRecord as the destination on the next call.
  	lastRecord []string
  }
  
  // NewReader returns a new Reader that reads from r.
  func NewReader(r io.Reader) *Reader {
  	return &Reader{
  		Comma: ',',
  		r:     bufio.NewReader(r),
  	}
  }
  
  // Read reads one record (a slice of fields) from r.
  // If the record has an unexpected number of fields,
  // Read returns the record along with the error ErrFieldCount.
  // Except for that case, Read always returns either a non-nil
  // record or a non-nil error, but not both.
  // If there is no data left to be read, Read returns nil, io.EOF.
  // If ReuseRecord is true, the returned slice may be shared
  // between multiple calls to Read.
  func (r *Reader) Read() (record []string, err error) {
  	if r.ReuseRecord {
  		record, err = r.readRecord(r.lastRecord)
  		r.lastRecord = record
  	} else {
  		record, err = r.readRecord(nil)
  	}
  	return record, err
  }
  
  // ReadAll reads all the remaining records from r.
  // Each record is a slice of fields.
  // A successful call returns err == nil, not err == io.EOF. Because ReadAll is
  // defined to read until EOF, it does not treat end of file as an error to be
  // reported.
  func (r *Reader) ReadAll() (records [][]string, err error) {
  	for {
  		record, err := r.readRecord(nil)
  		if err == io.EOF {
  			return records, nil
  		}
  		if err != nil {
  			return nil, err
  		}
  		records = append(records, record)
  	}
  }
  
  // readLine reads the next line (with the trailing endline).
  // If EOF is hit without a trailing endline, it will be omitted.
  // If some bytes were read, then the error is never io.EOF.
  // The result is only valid until the next call to readLine.
  func (r *Reader) readLine() ([]byte, error) {
  	line, err := r.r.ReadSlice('\n')
  	if err == bufio.ErrBufferFull {
  		r.rawBuffer = append(r.rawBuffer[:0], line...)
  		for err == bufio.ErrBufferFull {
  			line, err = r.r.ReadSlice('\n')
  			r.rawBuffer = append(r.rawBuffer, line...)
  		}
  		line = r.rawBuffer
  	}
  	if len(line) > 0 && err == io.EOF {
  		err = nil
  		// For backwards compatibility, drop trailing \r before EOF.
  		if line[len(line)-1] == '\r' {
  			line = line[:len(line)-1]
  		}
  	}
  	r.numLine++
  	// Normalize \r\n to \n on all input lines.
  	if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
  		line[n-2] = '\n'
  		line = line[:n-1]
  	}
  	return line, err
  }
  
  // lengthNL reports the number of bytes for the trailing \n.
  func lengthNL(b []byte) int {
  	if len(b) > 0 && b[len(b)-1] == '\n' {
  		return 1
  	}
  	return 0
  }
  
  // nextRune returns the next rune in b or utf8.RuneError.
  func nextRune(b []byte) rune {
  	r, _ := utf8.DecodeRune(b)
  	return r
  }
  
  // readRecord reads and parses the next record from the input.
  //
  // dst is an optional destination: if its capacity is large enough for the
  // parsed fields, the returned slice shares its backing array; otherwise a
  // new slice is allocated. All returned fields are substrings of a single
  // string built from recordBuffer.
  //
  // It returns errInvalidDelim (before reading anything) when Comma or
  // Comment is unusable, nil and io.EOF once the input is exhausted, a
  // *ParseError for quoting problems or a field-count mismatch, or the error
  // reported by readLine. When a parse or read error cuts a record short,
  // the fields completed before the error are still returned with it.
  func (r *Reader) readRecord(dst []string) ([]string, error) {
  	if r.Comma == r.Comment || !validDelim(r.Comma) || (r.Comment != 0 && !validDelim(r.Comment)) {
  		return nil, errInvalidDelim
  	}

  	// Read line (automatically skipping past empty lines and any comments).
  	// line is consumed as fields are parsed; fullLine keeps the whole
  	// physical line so error columns can be computed from what is left.
  	var line, fullLine []byte
  	var errRead error
  	for errRead == nil {
  		line, errRead = r.readLine()
  		if r.Comment != 0 && nextRune(line) == r.Comment {
  			line = nil
  			continue // Skip comment lines
  		}
  		if errRead == nil && len(line) == lengthNL(line) {
  			line = nil
  			continue // Skip empty lines
  		}
  		fullLine = line
  		break
  	}
  	if errRead == io.EOF {
  		return nil, errRead
  	}

  	// Parse each field in the record.
  	var err error
  	const quoteLen = len(`"`)
  	commaLen := utf8.RuneLen(r.Comma) // Byte width of Comma, used to step past it
  	recLine := r.numLine // Starting line for record
  	// Reuse the scratch buffers from the previous record.
  	r.recordBuffer = r.recordBuffer[:0]
  	r.fieldIndexes = r.fieldIndexes[:0]
  parseField:
  	for {
  		if r.TrimLeadingSpace {
  			line = bytes.TrimLeftFunc(line, unicode.IsSpace)
  		}
  		if len(line) == 0 || line[0] != '"' {
  			// Non-quoted string field
  			i := bytes.IndexRune(line, r.Comma)
  			field := line
  			if i >= 0 {
  				field = field[:i]
  			} else {
  				// Last field on the line: strip the trailing newline, if any.
  				field = field[:len(field)-lengthNL(field)]
  			}
  			// Check to make sure a quote does not appear in field.
  			if !r.LazyQuotes {
  				if j := bytes.IndexByte(field, '"'); j >= 0 {
  					// Column is the rune count of everything on the
  					// physical line that precedes the offending quote.
  					col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
  					err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
  					break parseField
  				}
  			}
  			r.recordBuffer = append(r.recordBuffer, field...)
  			r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
  			if i >= 0 {
  				line = line[i+commaLen:]
  				continue parseField
  			}
  			break parseField
  		} else {
  			// Quoted string field
  			line = line[quoteLen:]
  			for {
  				i := bytes.IndexByte(line, '"')
  				if i >= 0 {
  					// Hit next quote.
  					r.recordBuffer = append(r.recordBuffer, line[:i]...)
  					line = line[i+quoteLen:]
  					// Decide what the quote means from what follows it.
  					switch rn := nextRune(line); {
  					case rn == '"':
  						// `""` sequence (append quote).
  						r.recordBuffer = append(r.recordBuffer, '"')
  						line = line[quoteLen:]
  					case rn == r.Comma:
  						// `",` sequence (end of field).
  						line = line[commaLen:]
  						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
  						continue parseField
  					case lengthNL(line) == len(line):
  						// `"\n` sequence (end of line).
  						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
  						break parseField
  					case r.LazyQuotes:
  						// `"` sequence (bare quote).
  						r.recordBuffer = append(r.recordBuffer, '"')
  					default:
  						// `"*` sequence (invalid non-escaped quote).
  						// line now starts after the quote, so back up by
  						// quoteLen to report the quote's own column.
  						col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
  						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
  						break parseField
  					}
  				} else if len(line) > 0 {
  					// Hit end of line (copy all data so far).
  					r.recordBuffer = append(r.recordBuffer, line...)
  					if errRead != nil {
  						break parseField
  					}
  					line, errRead = r.readLine()
  					// EOF is not propagated here: the empty line it comes
  					// with sends the next iteration to the abrupt-end
  					// branch below, which decides how to report it.
  					if errRead == io.EOF {
  						errRead = nil
  					}
  					fullLine = line
  				} else {
  					// Abrupt end of file (EOF or error).
  					if !r.LazyQuotes && errRead == nil {
  						col := utf8.RuneCount(fullLine)
  						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
  						break parseField
  					}
  					r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
  					break parseField
  				}
  			}
  		}
  	}
  	// A parse error takes precedence over a read error.
  	if err == nil {
  		err = errRead
  	}

  	// Create a single string and create slices out of it.
  	// This pins the memory of the fields together, but allocates once.
  	str := string(r.recordBuffer) // Convert to string once to batch allocations
  	dst = dst[:0]
  	if cap(dst) < len(r.fieldIndexes) {
  		dst = make([]string, len(r.fieldIndexes))
  	}
  	dst = dst[:len(r.fieldIndexes)]
  	var preIdx int
  	for i, idx := range r.fieldIndexes {
  		dst[i] = str[preIdx:idx]
  		preIdx = idx
  	}

  	// Check or update the expected fields per record.
  	// The count check is skipped when an earlier error is already set; the
  	// update for FieldsPerRecord == 0 happens whether or not err is nil.
  	if r.FieldsPerRecord > 0 {
  		if len(dst) != r.FieldsPerRecord && err == nil {
  			err = &ParseError{StartLine: recLine, Line: recLine, Err: ErrFieldCount}
  		}
  	} else if r.FieldsPerRecord == 0 {
  		r.FieldsPerRecord = len(dst)
  	}
  	return dst, err
  }
  

View as plain text