-
-
Save kokes/3924255b9629b0bef87f8127d473cd56 to your computer and use it in GitHub Desktop.
csv reading with feature parity, take two
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"encoding/csv" | |
"fmt" | |
"io" | |
"io/ioutil" | |
"log" | |
"time" | |
) | |
var fnm string = "names.csv" | |
func main() { | |
dt, ee := ioutil.ReadFile(fnm) | |
if ee != nil { | |
panic(ee) | |
} | |
bf := bytes.NewReader(dt) | |
rd := NewReader(bf) | |
t := time.Now() | |
rdt, err := rd.ReadAll() | |
fmt.Println(time.Since(t)) | |
fmt.Println("nacteno ", len(rdt)) | |
if err != nil && err != io.EOF { | |
log.Fatal(err) | |
} | |
// canonical read | |
bf.Seek(0, 0) | |
crd := csv.NewReader(bf) | |
cdt, _ := crd.ReadAll() | |
// fmt.Println(strings.Join(rdt[1], "\n")) | |
// fmt.Println(rdt) | |
for j, vl := range rdt { | |
for k, el := range vl { | |
if cdt[j][k] != el { | |
// fmt.Println(strings.HasSuffix(el, "\r")) | |
// fmt.Println(strings.HasSuffix(cdt[j][k], "\r")) | |
log.Fatal(j, k, "SHOULD BE: ", cdt[j][k], " HAVE ", el, " len ", len(cdt[j][k]), "+++", len(el)) | |
} | |
} | |
} | |
} | |
// TODO (still missing compared to encoding/csv):
//   - strict quote validation (bare or unterminated quotes are accepted as
//     data and never reported as errors, i.e. the reader is always "lazy")
//   - comments
//   - FieldsPerRecord
//   - TrimLeadingSpace
//
// Perf gains:
//   - not appending single bytes (runes), slicing as much as possible
//   - operating on bytes, not runes
//   - not checking '\r' equality for each byte, only when '\n' is hit

const (
	readBufSize   = 256 // size of the internal read buffer
	maxEmptyReads = 100 // consecutive (0, nil) reads tolerated before giving up
)

// Parser states used by Read.
const (
	statePlain     = iota // outside of quotes
	stateQuoted           // inside a quoted field
	stateQuoteSeen        // inside a quoted field, right after a '"'
)

// Reader splits delimiter-separated records read from an io.Reader.
// Create one with NewReader; the zero value is not usable.
type Reader struct {
	r   io.Reader
	sep byte   // field separator
	buf []byte // fixed-size read buffer
	pos int    // index of the next unread byte in buf
	end int    // number of valid bytes in buf
	fld []byte // part of the current field that no longer lives in buf
	err error  // sticky error from the underlying reader
}

// NewReader returns a Reader that reads comma-separated records from r.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		r:   r,
		sep: ',',
		buf: make([]byte, readBufSize),
	}
}

// fill replaces the (fully consumed) buffer with fresh data. It returns nil
// when at least one byte is available. An error delivered together with data
// is remembered and returned by the next call, after the data was consumed.
func (r *Reader) fill() error {
	if r.err != nil {
		return r.err
	}
	for tries := 0; tries < maxEmptyReads; tries++ {
		n, err := r.r.Read(r.buf)
		r.pos, r.end = 0, n
		if err != nil {
			r.err = err
		}
		if n > 0 {
			return nil
		}
		if err != nil {
			return err
		}
	}
	r.err = io.ErrNoProgress
	return r.err
}

// takeField returns the current field: the bytes carried over in r.fld
// followed by chunk. With trimCR set, one trailing '\r' is dropped.
// r.fld is emptied for the next field.
func (r *Reader) takeField(chunk []byte, trimCR bool) string {
	if len(r.fld) > 0 {
		r.fld = append(r.fld, chunk...)
		chunk = r.fld
		r.fld = r.fld[:0] // chunk keeps its own length; string() below copies
	}
	if n := len(chunk); trimCR && n > 0 && chunk[n-1] == '\r' {
		chunk = chunk[:n-1]
	}
	return string(chunk)
}

// finish closes the record that was being read when the input ended.
// It reports io.EOF when there is no pending data at all.
func (r *Reader) finish(row []string, quoted bool, state int) ([]string, error) {
	// A '\r' right before EOF is dropped, like encoding/csv does, but only
	// when it is raw input and not the content of a quoted field.
	field := r.takeField(nil, state == statePlain)
	if len(row) == 0 && field == "" && !quoted {
		return nil, io.EOF
	}
	return append(row, field), nil
}

// Read returns the next record.
//
// Fields are split on the separator, records on '\n'; a '\r' directly before
// the '\n' is removed. A field that starts with '"' is quoted: separators and
// newlines inside it are data, "" stands for a literal quote and "\r\n" is
// normalised to "\n". Blank lines are skipped. The last record does not need
// a trailing newline.
//
// At the end of the input Read returns (nil, io.EOF). Any other error of the
// underlying reader is returned as is, with a nil record. Malformed quoting
// is not reported; the offending bytes are kept as field data.
func (r *Reader) Read() (row []string, err error) {
	state := statePlain
	quoted := false // the current field started with a quote
	r.fld = r.fld[:0]
	start := r.pos // start of the part of the current field still in r.buf

	for {
		if r.pos == r.end {
			// Save the unfinished field before the buffer is overwritten.
			r.fld = append(r.fld, r.buf[start:r.end]...)
			if err = r.fill(); err != nil {
				if err != io.EOF {
					return nil, err
				}
				return r.finish(row, quoted, state)
			}
			start = 0
		}

		i := r.pos
		ch := r.buf[i]
		r.pos++

		justClosed := false // ch directly follows a closing quote
		switch state {
		case stateQuoted:
			switch ch {
			case '"':
				// Closing quote or first half of "": decided by the next byte.
				r.fld = append(r.fld, r.buf[start:i]...)
				start = i + 1
				state = stateQuoteSeen
			case '\n':
				// Normalise "\r\n" inside quotes to "\n".
				r.fld = append(r.fld, r.buf[start:i]...)
				if n := len(r.fld); n > 0 && r.fld[n-1] == '\r' {
					r.fld = r.fld[:n-1]
				}
				start = i // keep the '\n' itself
			}
			continue
		case stateQuoteSeen:
			if ch == '"' {
				// Escaped quote: keep the second '"' as data.
				start = i
				state = stateQuoted
				continue
			}
			// The quote closed the quoted section; ch is handled as plain input.
			state = statePlain
			justClosed = true
		}

		switch {
		case ch == r.sep:
			row = append(row, r.takeField(r.buf[start:i], false))
			quoted = false
			start = i + 1
		case ch == '\n':
			// Only a raw '\r' is trimmed, never one that ended a quoted value.
			field := r.takeField(r.buf[start:i], !justClosed)
			start = i + 1
			if len(row) == 0 && field == "" && !quoted {
				continue // blank line
			}
			return append(row, field), nil
		case ch == '"' && !quoted && i == start && len(r.fld) == 0:
			// A quote opens a quoted field only as the very first byte.
			quoted = true
			state = stateQuoted
			start = i + 1
		}
	}
}

// ReadAll reads all remaining records.
//
// For compatibility with existing callers, reaching the end of the input is
// reported as io.EOF together with the records (encoding/csv returns nil
// here). Any other error is returned with nil records.
func (r *Reader) ReadAll() (rt [][]string, err error) {
	for {
		var line []string
		line, err = r.Read()
		if err != nil {
			break
		}
		rt = append(rt, line)
	}
	// The carry-over buffer may have grown as big as the biggest field,
	// so release it.
	r.fld = nil
	if err != io.EOF {
		return nil, err
	}
	return rt, err
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment