Skip to content

Instantly share code, notes, and snippets.

@kokes
Created September 5, 2016 09:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kokes/3924255b9629b0bef87f8127d473cd56 to your computer and use it in GitHub Desktop.
Save kokes/3924255b9629b0bef87f8127d473cd56 to your computer and use it in GitHub Desktop.
csv reading with feature parity, take two
package main
import (
"bytes"
"encoding/csv"
"fmt"
"io"
"io/ioutil"
"log"
"time"
)
// fnm is the path of the CSV file that main parses and cross-checks.
var fnm = "names.csv"

// main loads fnm into memory, parses it with the Reader defined in this file
// (printing the elapsed time and the number of records), then parses the same
// bytes with encoding/csv and aborts on the first difference between the two
// results.
func main() {
	dt, err := ioutil.ReadFile(fnm)
	if err != nil {
		log.Fatal(err)
	}
	bf := bytes.NewReader(dt)

	t := time.Now()
	rdt, err := NewReader(bf).ReadAll()
	fmt.Println(time.Since(t))
	fmt.Println("nacteno ", len(rdt))
	// ReadAll may report a clean end of input as io.EOF, so that is not a failure.
	if err != nil && err != io.EOF {
		log.Fatal(err)
	}

	// canonical read
	if _, err := bf.Seek(0, io.SeekStart); err != nil {
		log.Fatal(err)
	}
	cdt, err := csv.NewReader(bf).ReadAll()
	if err != nil {
		// Without a reference result there is nothing to compare against.
		log.Fatal("encoding/csv: ", err)
	}

	if len(rdt) < len(cdt) {
		log.Fatal("record count SHOULD BE: ", len(cdt), " HAVE ", len(rdt))
	}
	for j, vl := range rdt {
		if j >= len(cdt) {
			// Empty surplus rows are tolerated; anything with content is a mismatch.
			if len(vl) == 0 {
				continue
			}
			log.Fatal(j, " unexpected extra record: ", vl)
		}
		if len(vl) != len(cdt[j]) {
			log.Fatal(j, " field count SHOULD BE: ", len(cdt[j]), " HAVE ", len(vl))
		}
		for k, el := range vl {
			if cdt[j][k] != el {
				log.Fatal(j, k, "SHOULD BE: ", cdt[j][k], " HAVE ", el, " len ", len(cdt[j][k]), "+++", len(el))
			}
		}
	}
}
// Implemented:
// empty lines (skipped)
// carriage returns before a newline (dropped, also inside quoted fields)
// quoted fields, including "" as an escaped quote
// a last record that is not terminated by a newline
//
// Quotes are handled leniently and never produce an error: a quote inside an
// unquoted field is kept as is, text that follows a closing quote is appended
// to the field, and a quoted field still open at the end of input is returned
// with what was read.
//
// TODO:
// strict quote errors
// comments
// fieldsperrecord
// trimleadingspace
//
// Perf gains:
// not appending single bytes (runes), slicing as much as possible
// operating on bytes, not runes
// not checking '\r' equality for each byte, only when '\n' hit

// readBufSize is the size of the read buffer allocated for a Reader.
const readBufSize = 4096

// Reader reads records from CSV input, one record per call to Read.
type Reader struct {
	r    io.Reader
	sep  byte   // field separator
	buf  []byte // read buffer; the unread input is buf[pos:end]
	pos  int    // index of the next unread byte in buf
	end  int    // number of valid bytes in buf
	pbuf []byte // text of the current field that cannot be sliced straight out of buf
	ncol int    // field count of the last record, used to pre-size the next one
	err  error  // sticky error from r, reported once buf has been consumed
}

// NewReader returns a Reader that reads comma-separated records from r.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		r:   r,
		sep: ',',
		buf: make([]byte, readBufSize),
	}
}

// fill replaces the (fully consumed) buffer contents with fresh input.
// It returns nil when at least one byte is available. An error delivered
// together with data is remembered and returned by the next call.
func (r *Reader) fill() error {
	const maxEmptyReads = 100

	if r.err != nil {
		return r.err
	}
	if len(r.buf) == 0 {
		r.buf = make([]byte, readBufSize)
	}
	r.pos, r.end = 0, 0
	for i := 0; i < maxEmptyReads; i++ {
		n, err := r.r.Read(r.buf)
		if n > 0 {
			r.end = n
			r.err = err
			return nil
		}
		if err != nil {
			r.err = err
			return err
		}
	}
	// The source keeps returning (0, nil); give up instead of spinning.
	r.err = io.ErrNoProgress
	return r.err
}

// take returns the current field, r.pbuf followed by chunk, and resets r.pbuf.
func (r *Reader) take(chunk []byte) string {
	if len(r.pbuf) == 0 {
		return string(chunk)
	}
	r.pbuf = append(r.pbuf, chunk...)
	s := string(r.pbuf)
	r.pbuf = r.pbuf[:0]
	return s
}

// Read returns the next record as a slice of fields.
//
// Empty lines are skipped. At the end of input it returns (nil, io.EOF); a
// final record without a trailing newline is returned first, with a nil error.
// Any other error from the underlying reader is returned with a nil record.
func (r *Reader) Read() (row []string, err error) {
	const (
		fieldStart = iota // the next byte is the first byte of a field
		unquoted          // inside unquoted text
		quoted            // inside a quoted field
		quoteSeen         // just passed a '"' inside a quoted field
	)
	var (
		state     = fieldStart
		start     int  // r.buf[start:i] is field text not yet copied to r.pbuf
		guard     int  // leading bytes of r.pbuf that came from inside quotes
		wasQuoted bool // the current field began with a quote
	)
	r.pbuf = r.pbuf[:0]
	if r.ncol > 0 {
		row = make([]string, 0, r.ncol)
	}

	for {
		if r.pos >= r.end {
			// Buffer exhausted: keep the unfinished part of the field, then refill.
			if state == unquoted || state == quoted {
				r.pbuf = append(r.pbuf, r.buf[start:r.end]...)
			}
			if err = r.fill(); err != nil {
				break
			}
			start = 0
		}

		buf := r.buf[:r.end]
		for i := r.pos; i < len(buf); i++ {
			c := buf[i]
			switch state {
			case quoted:
				switch c {
				case '"':
					// Either the closing quote or the first half of "".
					r.pbuf = append(r.pbuf, buf[start:i]...)
					state, start = quoteSeen, i+1
				case '\n':
					// Turn "\r\n" into "\n"; the '\r' may sit in the previous buffer.
					if i > start {
						if buf[i-1] == '\r' {
							r.pbuf = append(r.pbuf, buf[start:i-1]...)
							start = i
						}
					} else if n := len(r.pbuf); n > 0 && r.pbuf[n-1] == '\r' {
						r.pbuf = r.pbuf[:n-1]
					}
				}
				continue
			case quoteSeen:
				if c == '"' {
					// Escaped quote: resume the quoted text at this quote.
					state, start = quoted, i
					continue
				}
				// The quote closed the field; what follows is plain text.
				state, start, guard = unquoted, i, len(r.pbuf)
			case fieldStart:
				if c == '"' {
					state, start, wasQuoted = quoted, i+1, true
					continue
				}
				state, start = unquoted, i
			}

			// state == unquoted from here on.
			if c != r.sep && c != '\n' {
				continue
			}
			chunk := buf[start:i]
			if c == '\n' {
				// Drop the '\r' of "\r\n", but never one that was inside quotes.
				if n := len(chunk); n > 0 {
					if chunk[n-1] == '\r' {
						chunk = chunk[:n-1]
					}
				} else if m := len(r.pbuf); m > guard && r.pbuf[m-1] == '\r' {
					r.pbuf = r.pbuf[:m-1]
				}
			}
			field := r.take(chunk)
			if c == '\n' {
				if len(row) == 0 && field == "" && !wasQuoted {
					// Blank line: not a record.
					state, guard = fieldStart, 0
					continue
				}
				r.pos = i + 1
				r.ncol = len(row) + 1
				return append(row, field), nil
			}
			row = append(row, field)
			state, guard, wasQuoted = fieldStart, 0, false
		}
		r.pos = r.end
	}

	if err != io.EOF {
		return nil, err
	}
	// End of input: whatever has been gathered is the last record.
	if state == fieldStart && len(row) == 0 {
		return nil, io.EOF
	}
	if n := len(r.pbuf); state == unquoted && n > guard && r.pbuf[n-1] == '\r' {
		r.pbuf = r.pbuf[:n-1]
	}
	field := r.take(nil)
	if len(row) == 0 && field == "" && !wasQuoted {
		return nil, io.EOF
	}
	r.ncol = len(row) + 1
	return append(row, field), nil
}

// ReadAll reads all remaining records.
//
// On success it returns the records together with io.EOF (not nil), so
// callers must treat io.EOF as the normal outcome. On any other error it
// returns nil and that error.
func (r *Reader) ReadAll() (rt [][]string, err error) {
	for {
		var row []string
		if row, err = r.Read(); err != nil {
			break
		}
		rt = append(rt, row)
	}
	// The field scratch buffer may have grown as large as the biggest field,
	// so drop the reference and let the garbage collector reclaim it.
	r.pbuf = nil
	if err != io.EOF {
		return nil, err
	}
	return rt, err
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment