Skip to content

Instantly share code, notes, and snippets.

@thomaswilburn
Last active September 14, 2018 18:57
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomaswilburn/e3ee6189bd5b1f51a1fbc7636b64a3de to your computer and use it in GitHub Desktop.
CSV iterators
// test data -- we also use an identical "test.csv" to check streams
// (deliberately includes a quoted cell containing a comma, plain numerics,
// and a quoted multi-word string, to exercise the quoting code paths)
var csv = `
one,two,three
1,2,3
a,b,c
"1,000",1000,"hey there"
`.trim();
// Pair each value from any iterable with its zero-based position, yielding
// [value, index] tuples -- a generator analogue of Array#forEach's arguments.
var forEach = function*(iter) {
  var index = 0;
  for (var value of iter) {
    yield [value, index];
    index += 1;
  }
};
/*
This is the oddest--and most interesting--function of the script, even at only 8 lines.
We want to be able to read from a string one character at a time. However, we also need
to be able to peek ahead at the incoming tokens for parsing, so we need a generator that
can be "backed up." This is not traditionally a thing you can do with iterators, but
it's simple enough here: in JS, when a generator resumes after yielding, the first
argument to `iterator.next()` is used to evaluate in the place of the `yield` expression.
In this case, our peek function can pass in an offset to adjust where we are in the
string (`inc`). Since the generator resumes after yield, the order of this is a little
weird:
* initial run to yield
* on resume, set inc
* cycle to the "next" value and store it
* adjust position based on the inc value passed in
* yield the new value
Using the iterator this way means that our loops look a little different from traditional
for...of iteration, since we need to initialize and store the generator before the loop and
then independently call its `.next()` method. But if you think of these as a stream of
values, instead of a "loop" object, it's a lot easier to conceptualize.
*/
var readChars = function*(input) {
// cursor offset handed back through .next(inc); 0 means "advance normally"
var inc = 0;
for (var i = 0; i < input.length; i++) {
// capture the character BEFORE applying the offset: calling .next(-1)
// therefore yields the upcoming char, then backs the cursor up so the
// following plain .next() re-reads that same char (a one-char peek)
var c = input[i];
i += inc;
// plain for...of passes undefined into .next(), so default inc back to 0
inc = (yield c) || 0;
}
};
// readLinesAsync is needed, because streams give us multi-character chunks as input;
// it buffers characters across chunk boundaries and yields one complete line at a time
var readLinesAsync = async function*(input) {
  var pending = "";
  // await each chunk from the stream, then walk it character by character
  for await (var chunk of input) {
    for (var ch of chunk) {
      if (ch != "\n") {
        pending += ch;
        continue;
      }
      yield pending;
      pending = "";
    }
  }
  // flush whatever remains after the final chunk (possibly an empty string)
  yield pending;
};
// Synchronous counterpart: split any iterable of characters (typically a
// string) on newlines, yielding each line, including a trailing remainder.
var readLines = function*(input) {
  var pending = "";
  for (var ch of input) {
    if (ch != "\n") {
      pending += ch;
      continue;
    }
    yield pending;
    pending = "";
  }
  // whatever is left after the last newline is still a line (may be "")
  yield pending;
};
// parseLine() does the actual parsing, and yields a stream of cell values from a row.
// Fixes over the original: trailing empty cells are no longer dropped ("a,b," now
// yields three cells), RFC 4180 doubled-quote escapes ("") are unescaped instead of
// throwing, and errors are real Error objects so they carry stack traces.
// Throws on a character following a closing quote that isn't a comma, and on a
// line that ends while still inside quotes.
var parseLine = function*(line) {
  var quoting = false;
  var buffer = "";
  for (var i = 0; i < line.length; i++) {
    var c = line[i];
    if (c == `"`) {
      if (!quoting) {
        quoting = true;
      } else if (line[i + 1] == `"`) {
        // RFC 4180: a doubled quote inside a quoted cell is a literal quote
        buffer += `"`;
        i++;
      } else {
        // closing quote: the cell must end here (next char is a comma or EOL)
        var peek = line[i + 1];
        if (peek && peek != ",") throw new Error("Cell continued after quote character");
        quoting = false;
      }
      continue;
    }
    if (c == "," && !quoting) {
      yield buffer;
      buffer = "";
      continue;
    }
    buffer += c;
  }
  if (quoting) throw new Error("Unexpected end of line while still quoted");
  // yield the final cell -- even an empty one -- unless the line itself was empty
  if (buffer.length || line.length) yield buffer;
};
// convert values to primitive types: booleans and numbers come back typed,
// everything else (and any non-string input) passes through untouched.
// The original /^[\d.]+$/ wrongly matched "1.2.3" (returning 1.2) and "."
// (returning NaN); the stricter pattern below also accepts negative numbers.
var cast = function(v) {
  if (typeof v != "string") return v;
  if (v == "true") return true;
  if (v == "false") return false;
  // optional sign, digits, optional single decimal part
  if (/^-?\d+(\.\d+)?$/.test(v)) return parseFloat(v);
  return v;
};
// processLine assembles cells into a row, and is shared between sync and async code
// although I/O has to be handled asynchronously, we always have individual lines all at once
var processLine = function(line, options) {
  // materialize the cells up front, casting to primitives if requested
  var cells = [...parseLine(line)];
  if (options.autoParse) cells = cells.map(cast);
  // without a header, a row is just the positional array of cells
  if (!options.header) return cells;
  // with a header, key each cell by its column name
  var keyed = {};
  cells.forEach(function(cell, i) {
    keyed[options.header[i]] = cell;
  });
  return keyed;
};
// for streams, use this -- an async generator yielding one parsed row per line.
// Options are merged over defaults instead of relying solely on a default
// parameter, so callers passing { header: true } still get autoParse: true
// (the old signature silently dropped the default); copying the object also
// keeps us from mutating the caller's options when we replace `header`.
var parseCSVAsync = async function*(input, options = {}) {
  options = { autoParse: true, ...options };
  var lines = readLinesAsync(input);
  if (options.header === true) {
    // consume the first line and turn it into the array of column names
    var first = (await lines.next()).value;
    options.header = [...parseLine(first)];
  }
  for await (var line of lines) {
    yield processLine(line, options);
  }
};
// for strings, use this -- a sync generator yielding one parsed row per line.
// Options are merged over defaults instead of relying solely on a default
// parameter, so callers passing { header: true } still get autoParse: true
// (the old signature silently dropped the default); copying the object also
// keeps us from mutating the caller's options when we replace `header`.
var parseCSV = function*(input, options = {}) {
  options = { autoParse: true, ...options };
  var lines = readLines(input);
  if (options.header === true) {
    // consume the first line and turn it into the array of column names
    var first = lines.next().value;
    options.header = [...parseLine(first)];
  }
  for (var line of lines) {
    yield processLine(line, options);
  }
};
console.log("=== With header, not streaming ===");
// exhaust the sync parser and print each keyed row
[...parseCSV(csv, { header: true, autoParse: true })].forEach(function(row) {
  console.log(row);
});
// exercise the streaming path against the on-disk copy of the test data
(async function() {
  var fs = require("fs");
  var stream = fs.createReadStream("test.csv", "utf-8");
  console.log("=== No header, streaming ===");
  for await (var row of parseCSVAsync(stream)) {
    console.log(row);
  }
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment