Skip to content

Instantly share code, notes, and snippets.

@thomaswilburn
Last active September 14, 2018 18:57
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomaswilburn/e3ee6189bd5b1f51a1fbc7636b64a3de to your computer and use it in GitHub Desktop.
CSV iterators
// test data -- we also use an identical "test.csv" to check streams
// (deliberately includes a quoted cell containing a comma, plain numerics,
// and a quoted multi-word string, to exercise the quoting code paths)
var csv = `
one,two,three
1,2,3
a,b,c
"1,000",1000,"hey there"
`.trim();
// Pair each value from any iterable with its zero-based position, yielding
// [value, index] tuples -- a generator analogue of Array#forEach's arguments.
var forEach = function*(iter) {
  var index = 0;
  for (var value of iter) {
    yield [value, index];
    index += 1;
  }
};
/*
This is the oddest--and most interesting--function of the script, even at only 8 lines.
We want to be able to read from a string one character at a time. However, we also need
to be able to peek ahead at the incoming tokens for parsing, so we need a generator that
can be "backed up." This is not traditionally a thing you can do with iterators, but
it's simple enough here: in JS, when a generator resumes after yielding, the first
argument to `iterator.next()` is used to evaluate in the place of the `yield` expression.
In this case, our peek function can pass in an offset to adjust where we are in the
string (`inc`). Since the generator resumes after yield, the order of this is a little
weird:
* initial run to yield
* on resume, set inc
* cycle to the "next" value and store it
* adjust position based on the inc value passed in
* yield the new value
Using the iterator this way means that our loops look a little different from traditional
for...of iteration, since we need to initialize and store the generator before the loop and
then independently call its `.next()` method. But if you think of these as a stream of
values, instead of a "loop" object, it's a lot easier to conceptualize.
*/
var readChars = function*(input) {
// cursor offset handed back through .next(inc); 0 means "advance normally"
var inc = 0;
for (var i = 0; i < input.length; i++) {
// capture the character BEFORE applying the offset: calling .next(-1)
// therefore yields the upcoming char, then backs the cursor up so the
// following plain .next() re-reads that same char (a one-char peek)
var c = input[i];
i += inc;
// plain for...of passes undefined into .next(), so default inc back to 0
inc = (yield c) || 0;
}
};
// readLinesAsync is needed, because streams give us multi-character chunks as input;
// it buffers characters across chunk boundaries and yields one complete line at a time
var readLinesAsync = async function*(input) {
  var pending = "";
  // await each chunk from the stream, then walk it character by character
  for await (var chunk of input) {
    for (var ch of chunk) {
      if (ch != "\n") {
        pending += ch;
        continue;
      }
      yield pending;
      pending = "";
    }
  }
  // flush whatever remains after the final chunk (possibly an empty string)
  yield pending;
};
// Synchronous counterpart: split any iterable of characters (typically a
// string) on newlines, yielding each line, including a trailing remainder.
var readLines = function*(input) {
  var pending = "";
  for (var ch of input) {
    if (ch != "\n") {
      pending += ch;
      continue;
    }
    yield pending;
    pending = "";
  }
  // whatever is left after the last newline is still a line (may be "")
  yield pending;
};
// parseLine() does the actual parsing, and yields a stream of cell values from a row.
// Fixes over the original: trailing empty cells are no longer dropped ("a,b," now
// yields three cells), RFC 4180 doubled-quote escapes ("") are unescaped instead of
// throwing, and errors are real Error objects so they carry stack traces.
// Throws on a character following a closing quote that isn't a comma, and on a
// line that ends while still inside quotes.
var parseLine = function*(line) {
  var quoting = false;
  var buffer = "";
  for (var i = 0; i < line.length; i++) {
    var c = line[i];
    if (c == `"`) {
      if (!quoting) {
        quoting = true;
      } else if (line[i + 1] == `"`) {
        // RFC 4180: a doubled quote inside a quoted cell is a literal quote
        buffer += `"`;
        i++;
      } else {
        // closing quote: the cell must end here (next char is a comma or EOL)
        var peek = line[i + 1];
        if (peek && peek != ",") throw new Error("Cell continued after quote character");
        quoting = false;
      }
      continue;
    }
    if (c == "," && !quoting) {
      yield buffer;
      buffer = "";
      continue;
    }
    buffer += c;
  }
  if (quoting) throw new Error("Unexpected end of line while still quoted");
  // yield the final cell -- even an empty one -- unless the line itself was empty
  if (buffer.length || line.length) yield buffer;
};
// convert values to primitive types: booleans and numbers come back typed,
// everything else (and any non-string input) passes through untouched.
// The original /^[\d.]+$/ wrongly matched "1.2.3" (returning 1.2) and "."
// (returning NaN); the stricter pattern below also accepts negative numbers.
var cast = function(v) {
  if (typeof v != "string") return v;
  if (v == "true") return true;
  if (v == "false") return false;
  // optional sign, digits, optional single decimal part
  if (/^-?\d+(\.\d+)?$/.test(v)) return parseFloat(v);
  return v;
};
// processLine assembles cells into a row, and is shared between sync and async code
// although I/O has to be handled asynchronously, we always have individual lines all at once
var processLine = function(line, options) {
  // materialize the cells up front, casting to primitives if requested
  var cells = [...parseLine(line)];
  if (options.autoParse) cells = cells.map(cast);
  // without a header, a row is just the positional array of cells
  if (!options.header) return cells;
  // with a header, key each cell by its column name
  var keyed = {};
  cells.forEach(function(cell, i) {
    keyed[options.header[i]] = cell;
  });
  return keyed;
};
// for streams, use this -- an async generator yielding one parsed row per line.
// Options are merged over defaults instead of relying solely on a default
// parameter, so callers passing { header: true } still get autoParse: true
// (the old signature silently dropped the default); copying the object also
// keeps us from mutating the caller's options when we replace `header`.
var parseCSVAsync = async function*(input, options = {}) {
  options = { autoParse: true, ...options };
  var lines = readLinesAsync(input);
  if (options.header === true) {
    // consume the first line and turn it into the array of column names
    var first = (await lines.next()).value;
    options.header = [...parseLine(first)];
  }
  for await (var line of lines) {
    yield processLine(line, options);
  }
};
// for strings, use this -- a sync generator yielding one parsed row per line.
// Options are merged over defaults instead of relying solely on a default
// parameter, so callers passing { header: true } still get autoParse: true
// (the old signature silently dropped the default); copying the object also
// keeps us from mutating the caller's options when we replace `header`.
var parseCSV = function*(input, options = {}) {
  options = { autoParse: true, ...options };
  var lines = readLines(input);
  if (options.header === true) {
    // consume the first line and turn it into the array of column names
    var first = lines.next().value;
    options.header = [...parseLine(first)];
  }
  for (var line of lines) {
    yield processLine(line, options);
  }
};
console.log("=== With header, not streaming ===");
// exhaust the sync parser and print each keyed row
[...parseCSV(csv, { header: true, autoParse: true })].forEach(function(row) {
  console.log(row);
});
// exercise the streaming path against the on-disk copy of the test data
(async function() {
  var fs = require("fs");
  var stream = fs.createReadStream("test.csv", "utf-8");
  console.log("=== No header, streaming ===");
  for await (var row of parseCSVAsync(stream)) {
    console.log(row);
  }
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment