Skip to content

Instantly share code, notes, and snippets.

@kokes
Created August 24, 2016 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kokes/4f3e14beac648f3ec3790ffb8d2ee5a1 to your computer and use it in GitHub Desktop.
Save kokes/4f3e14beac648f3ec3790ffb8d2ee5a1 to your computer and use it in GitHub Desktop.
Replacing rune reading with byte reading to improve performance (15-20%); see https://github.com/golang/go/issues/16791
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package csv reads and writes comma-separated values (CSV) files.
// There are many kinds of CSV files; this package supports the format
// described in RFC 4180.
//
// A csv file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
// field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored. A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields. The beginning and ending quote are not part of the
// field.
//
// The source:
//
// normal string,"quoted-field"
//
// results in the fields
//
// {`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
// "the ""word"" is true","a ""quoted-field"""
//
// results in
//
// {`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field:
//
// "Multi-line
// field","comma is ,"
//
// results in
//
// {`Multi-line
// field`, `comma is ,`}
package csv
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
)
// ParseError reports a parsing failure together with the position at which
// it was detected. Lines are numbered from 1 and columns from 0.
type ParseError struct {
	Line   int   // Line on which the error occurred
	Column int   // Column at which the error occurred, counted in bytes (a folded \r\n counts once)
	Err    error // The underlying error
}

// Error formats the error as "line L, column C: <underlying error>".
func (e *ParseError) Error() string {
	const layout = "line %d, column %d: %s"
	return fmt.Sprintf(layout, e.Line, e.Column, e.Err)
}
// These are the errors that can be returned in ParseError.Err.
var (
// ErrTrailingComma is kept only so that existing references still compile.
//
// Deprecated: no longer used.
ErrTrailingComma = errors.New("extra delimiter at end of line") // no longer used
// ErrBareQuote reports a " inside an unquoted field while LazyQuotes is false.
ErrBareQuote = errors.New("bare \" in non-quoted-field")
// ErrQuote reports, while LazyQuotes is false, a quoted field that is still
// open at end of input or whose closing " is followed by something other
// than the delimiter, a newline, or another ".
ErrQuote = errors.New("extraneous \" in field")
// ErrFieldCount reports a record whose field count differs from a positive
// FieldsPerRecord.
ErrFieldCount = errors.New("wrong number of fields in line")
)
// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
// Input is consumed one byte at a time rather than one rune at a time, which
// is why Comma and Comment are single bytes.
type Reader struct {
// Comma is the field delimiter.
// It is set to comma (',') by NewReader.
// Being a single byte, it cannot express a multi-byte UTF-8 delimiter.
Comma byte
// Comment, if not 0, is the comment character. Lines beginning with the
// Comment character without preceding whitespace are ignored.
// With leading whitespace the Comment character becomes part of the
// field, even if TrimLeadingSpace is true.
Comment byte
// FieldsPerRecord is the number of expected fields per record.
// If FieldsPerRecord is positive, Read requires each record to
// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
// the number of fields in the first record, so that future records must
// have the same field count. If FieldsPerRecord is negative, no check is
// made and records may have a variable number of fields.
FieldsPerRecord int
// If LazyQuotes is true, a quote may appear in an unquoted field and a
// non-doubled quote may appear in a quoted field.
LazyQuotes bool
TrailingComma bool // ignored; here for backwards compatibility
// If TrimLeadingSpace is true, leading white space in a field is ignored.
// This is done even if the field delimiter, Comma, is white space.
TrimLeadingSpace bool
// line is the number of the line currently being parsed; the first line is 1.
line int
// column is the index, within the current line, of the byte most recently
// returned by readByte. parseRecord resets it to -1 before each record so
// that the first byte read lands on column 0.
column int
// r is the buffered source that input bytes are read from.
r *bufio.Reader
// field accumulates the bytes of the field currently being parsed.
field bytes.Buffer
}
// NewReader wraps r in a buffered reader and returns a Reader for it whose
// field delimiter is ','. All other settings keep their zero values.
func NewReader(r io.Reader) *Reader {
	rd := new(Reader)
	rd.Comma = ','
	rd.r = bufio.NewReader(r)
	return rd
}
// error wraps err in a *ParseError stamped with the reader's current line
// and column.
func (r *Reader) error(err error) error {
	pe := new(ParseError)
	pe.Line, pe.Column, pe.Err = r.line, r.column, err
	return pe
}
// Read returns the next record from r as a slice of strings, one string per
// field. Lines that yield no record (blank lines, comment lines) are skipped.
//
// When FieldsPerRecord is positive and the record has a different number of
// fields, Read returns the record together with an ErrFieldCount ParseError.
// When FieldsPerRecord is 0, it is set to the field count of this record.
func (r *Reader) Read() (record []string, err error) {
	// Keep parsing until a line produces a record or an error ends the input.
	for record == nil {
		record, err = r.parseRecord()
		if record == nil && err != nil {
			return nil, err
		}
	}

	switch {
	case r.FieldsPerRecord == 0:
		// First record seen: it fixes the expected width of later records.
		r.FieldsPerRecord = len(record)
	case r.FieldsPerRecord > 0 && len(record) != r.FieldsPerRecord:
		r.column = 0 // report at start of record
		return record, r.error(ErrFieldCount)
	}
	return record, nil
}
// ReadAll reads every remaining record from r and returns them, each record
// being a slice of fields.
// Reaching end of input is the normal way for ReadAll to finish, so a
// successful call returns err == nil rather than err == io.EOF. Any other
// error discards the records gathered so far and is returned as is.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		rec, readErr := r.Read()
		switch {
		case readErr == io.EOF:
			return records, nil
		case readErr != nil:
			return nil, readErr
		}
		records = append(records, rec)
	}
}
// readByte returns the next byte of input, folding "\r\n" into a single '\n',
// and advances r.column once so that it indexes the byte just returned.
//
// The folding makes the simplifying assumption that any '\r' directly
// followed by '\n' can be reduced to '\n'; files mixing "\r\n" and bare "\n"
// are not detected. A '\r' followed by any other byte is returned unchanged.
// If the read after a '\r' fails, that error is returned and the '\r' itself
// is dropped.
func (r *Reader) readByte() (byte, error) {
	b, err := r.r.ReadByte()
	r.column++
	if b != '\r' {
		return b, err
	}

	// Look one byte ahead to decide whether this '\r' starts a "\r\n" pair.
	next, err := r.r.ReadByte()
	switch {
	case err != nil:
		return next, err
	case next == '\n':
		return '\n', nil
	}
	// Not a pair: push the lookahead byte back and hand out the bare '\r'.
	r.r.UnreadByte()
	return '\r', nil
}
// skip consumes bytes up to and including the first occurrence of delim.
// It returns nil once delim has been consumed, or the read error that
// stopped it first.
func (r *Reader) skip(delim byte) error {
	b, err := r.readByte()
	for err == nil && b != delim {
		b, err = r.readByte()
	}
	return err
}
// parseRecord reads one line-level record from r and returns its fields.
//
// It returns nil fields with a nil error for a line that yields no record
// (a comment line or a blank line). When the input ends right after the last
// field, the fields are returned together with io.EOF. Any other error is
// returned with nil fields.
func (r *Reader) parseRecord() (fields []string, err error) {
	// A record always begins on a fresh line. Lines are numbered from 1, and
	// column starts at -1 so that the increment in readByte lands on 0 for
	// the first byte.
	r.line++
	r.column = -1

	// Look at the first byte without counting it as a column: a read error
	// ends the record immediately, and a comment marker discards the line.
	first, err := r.r.ReadByte()
	if err != nil {
		return nil, err
	}
	if r.Comment != 0 && first == r.Comment {
		return nil, r.skip('\n')
	}
	r.r.UnreadByte()

	// From here on the line holds at least one field.
	for {
		ok, delim, ferr := r.parseField()
		if ok {
			// A positive FieldsPerRecord tells us the final length up front,
			// so allocate the slice once.
			if fields == nil && r.FieldsPerRecord > 0 {
				fields = make([]string, 0, r.FieldsPerRecord)
			}
			fields = append(fields, r.field.String())
		}
		switch {
		case delim == '\n' || ferr == io.EOF:
			return fields, ferr
		case ferr != nil:
			return nil, ferr
		}
	}
}
// parseField parses the next field of the current record, leaving its
// contents in r.field.
//
// haveField reports whether a field (possibly empty) was read. delim is the
// byte that ended the field: r.Comma or '\n', or 0 when the field was ended
// by an error. err is io.EOF when the input ended, a *ParseError for
// malformed quoting, or the underlying read error.
//
// When r.TrimLeadingSpace is set, leading ASCII white space is skipped before
// the field is examined. Only single-byte white space is recognised, because
// the reader works on bytes; multi-byte Unicode spaces are kept as field
// content.
func (r *Reader) parseField() (haveField bool, delim byte, err error) {
	r.field.Reset()

	r1, err := r.readByte()
	// Skip leading white space only. A newline is never skipped here because
	// it terminates the record. Without the isSpaceByte test this loop would
	// swallow the whole line.
	for err == nil && r.TrimLeadingSpace && r1 != '\n' && isSpaceByte(r1) {
		r1, err = r.readByte()
	}

	// Input ended after at least one byte of this line was consumed: that
	// still counts as a (trailing, empty) field.
	if err == io.EOF && r.column != 0 {
		return true, 0, err
	}
	if err != nil {
		return false, 0, err
	}

	switch r1 {
	case r.Comma:
		// Empty field; handled by the common return at the bottom.

	case '\n':
		// Either a trailing empty field or, at column 0, a blank line.
		if r.column == 0 {
			return false, r1, nil
		}
		return true, r1, nil

	case '"':
		// Quoted field: copy bytes until the closing quote.
	Quoted:
		for {
			r1, err = r.readByte()
			if err != nil {
				if err == io.EOF {
					// Unterminated quoted field at end of input.
					if r.LazyQuotes {
						return true, 0, err
					}
					return false, 0, r.error(ErrQuote)
				}
				return false, 0, err
			}
			switch r1 {
			case '"':
				// A quote inside a quoted field: what follows decides whether
				// it closes the field or is an escaped quote.
				r1, err = r.readByte()
				if err != nil || r1 == r.Comma {
					break Quoted
				}
				if r1 == '\n' {
					return true, r1, nil
				}
				if r1 != '"' {
					if !r.LazyQuotes {
						r.column-- // report the position of the offending quote
						return false, 0, r.error(ErrQuote)
					}
					// Accept the bare quote.
					r.field.WriteByte('"')
				}
			case '\n':
				// Newline embedded in a quoted field: keep position tracking
				// in step with the input.
				r.line++
				r.column = -1
			}
			r.field.WriteByte(r1)
		}

	default:
		// Unquoted field: copy bytes up to the delimiter, newline or error.
		for {
			r.field.WriteByte(r1)
			r1, err = r.readByte()
			if err != nil || r1 == r.Comma {
				break
			}
			if r1 == '\n' {
				return true, r1, nil
			}
			if !r.LazyQuotes && r1 == '"' {
				return false, 0, r.error(ErrBareQuote)
			}
		}
	}

	if err != nil {
		if err == io.EOF {
			return true, 0, err
		}
		return false, 0, err
	}
	return true, r1, nil
}

// isSpaceByte reports whether b is an ASCII white space byte: the single-byte
// members of the set accepted by unicode.IsSpace.
func isSpaceByte(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\v', '\f', '\r':
		return true
	}
	return false
}
60d59
< "unicode"
93c92
< Comma rune
---
> Comma byte
98c97
< Comment rune
---
> Comment byte
182,183c181,182
< func (r *Reader) readRune() (rune, error) {
< r1, _, err := r.r.ReadRune()
---
> func (r *Reader) readByte() (byte, error) {
> r1, err := r.r.ReadByte()
189c188
< r1, _, err = r.r.ReadRune()
---
> r1, err = r.r.ReadByte()
192c191
< r.r.UnreadRune()
---
> r.r.UnreadByte()
202c201
< func (r *Reader) skip(delim rune) error {
---
> func (r *Reader) skip(delim byte) error {
204c203
< r1, err := r.readRune()
---
> r1, err := r.readByte()
226c225
< r1, _, err := r.r.ReadRune()
---
> r1, err := r.r.ReadByte()
234c233
< r.r.UnreadRune()
---
> r.r.UnreadByte()
258c257
< func (r *Reader) parseField() (haveField bool, delim rune, err error) {
---
> func (r *Reader) parseField() (haveField bool, delim byte, err error) {
261,263c260,262
< r1, err := r.readRune()
< for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
< r1, err = r.readRune()
---
> r1, err := r.readByte()
> for err == nil && r.TrimLeadingSpace && r1 != '\n' {
> r1, err = r.readByte()
288c287
< r1, err = r.readRune()
---
> r1, err = r.readByte()
300c299
< r1, err = r.readRune()
---
> r1, err = r.readByte()
313c312
< r.field.WriteRune('"')
---
> r.field.WriteByte('"')
319c318
< r.field.WriteRune(r1)
---
> r.field.WriteByte(r1)
325,326c324,325
< r.field.WriteRune(r1)
< r1, err = r.readRune()
---
> r.field.WriteByte(r1)
> r1, err = r.readByte()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment