Skip to content

Instantly share code, notes, and snippets.

@creationix
Last active December 11, 2023 15:37
Show Gist options
  • Star 36 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save creationix/5992451 to your computer and use it in GitHub Desktop.
Save creationix/5992451 to your computer and use it in GitHub Desktop.
A streaming JSON parser as an embeddable state machine.
// A streaming, byte-oriented JSON parser. Feed it a single byte at a time and
// it will emit complete values as it comes across them. Whitespace within and
// between values is ignored, which means it can parse newline-delimited JSON.
//
// emit(value) - called once for every complete JSON value.
// next(byte)  - optional state that receives any byte which cannot start a
//               value (the array/object machines pass their separator state
//               here). When omitted the machine is top-level: after each value
//               it waits for another value and throws on an unexpected byte.
//
// Returns the initial state. Every state is a function that takes one byte and
// returns the state to use for the following byte. Calling $value with a falsy
// byte (the end-of-input flush, or a NUL byte) returns undefined.
function jsonMachine(emit, next) {
  next = next || $value;
  return $value;

  // Dispatches on the first byte of a value to the matching sub-machine.
  function $value(byte) {
    if (!byte) return;
    if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
      return $value; // Ignore whitespace
    }
    if (byte === 0x22) { // "
      return stringMachine(onValue);
    }
    // Only '-' and the ASCII digits 0x30..0x39 may start a number; the bytes
    // 0x3a..0x3f (: ; < = > ?) must not be mistaken for digits.
    if (byte === 0x2d || (byte >= 0x30 && byte <= 0x39)) { // - or 0-9
      return numberMachine(byte, onNumber);
    }
    if (byte === 0x7b) { // {
      return objectMachine(onValue);
    }
    if (byte === 0x5b) { // [
      return arrayMachine(onValue);
    }
    if (byte === 0x74) { // t
      return constantMachine(TRUE, true, onValue);
    }
    if (byte === 0x66) { // f
      return constantMachine(FALSE, false, onValue);
    }
    if (byte === 0x6e) { // n
      return constantMachine(NULL, null, onValue);
    }
    if (next === $value) {
      throw new Error("Unexpected 0x" + byte.toString(16));
    }
    return next(byte);
  }

  // Completion callback for self-delimiting values (strings, containers,
  // constants): report the value and continue with the follow-up state.
  function onValue(value) {
    emit(value);
    return next;
  }

  // Completion callback for numbers. A number only ends when a non-number byte
  // arrives, so that byte still has to be processed. It is handed to `next`
  // rather than back to $value: inside an array or object the byte after a
  // value must be a separator or closer, not the start of another value
  // (otherwise "[1 2]" would be accepted as [1, 2]). At top level `next` is
  // $value, so behavior there is unchanged.
  function onNumber(number, byte) {
    emit(number);
    if (!byte) return; // End-of-input flush: nothing left to dispatch.
    return next(byte);
  }
}
// Byte sequences that must follow the first letter of each JSON literal.
// jsonMachine consumes the leading 't', 'f' or 'n' itself and passes one of
// these arrays to constantMachine to match the rest.
var TRUE = [0x72, 0x75, 0x65]; // "rue"
var FALSE = [0x61, 0x6c, 0x73, 0x65]; // "alse"
var NULL = [0x75, 0x6c, 0x6c]; // "ull"
// Nestable state machine that matches a fixed sequence of bytes.
//
// bytes - array of the byte values that must arrive, in order.
// value - value handed to `emit` once every byte has matched.
// emit  - completion callback; its return value becomes the next state.
//
// Returns the state that expects the first byte. Throws an Error as soon as
// an incoming byte differs from the expected one.
function constantMachine(bytes, value, emit) {
  var position = 0;
  var total = bytes.length;

  function $expect(byte) {
    var wanted = bytes[position];
    position += 1;
    if (byte === wanted) {
      // Keep matching until the whole sequence has been seen.
      return position < total ? $expect : emit(value);
    }
    throw new Error("Unexpected 0x" + byte.toString(16));
  }

  return $expect;
}
// Nestable state machine for a JSON string. The caller consumes the opening
// quote; this machine consumes everything up to and including the closing
// quote.
//
// emit - called with the decoded string when the closing quote arrives; its
//        return value becomes the next state.
//
// Returns the initial state. Throws an Error on raw ASCII control characters
// and on unknown escape sequences.
function stringMachine(emit) {
  var string = "";
  return $string;

  // Regular string body.
  function $string(byte) {
    if (byte === 0x22) { // "
      return emit(string);
    }
    if (byte === 0x5c) { // \
      return $escapedString;
    }
    if (byte & 0x80) { // UTF-8 handling
      return utf8Machine(byte, onCharCode);
    }
    if (byte < 0x20) { // ASCII control character
      throw new Error("Unexpected control character: 0x" + byte.toString(16));
    }
    string += String.fromCharCode(byte);
    return $string;
  }

  // The byte directly after a backslash.
  function $escapedString(byte) {
    if (byte === 0x22 || byte === 0x5c || byte === 0x2f) { // " \ /
      string += String.fromCharCode(byte);
      return $string;
    }
    if (byte === 0x62) { // b
      string += "\b";
      return $string;
    }
    if (byte === 0x66) { // f
      string += "\f";
      return $string;
    }
    if (byte === 0x6e) { // n
      string += "\n";
      return $string;
    }
    if (byte === 0x72) { // r
      string += "\r";
      return $string;
    }
    if (byte === 0x74) { // t
      string += "\t";
      return $string;
    }
    if (byte === 0x75) { // u
      return hexMachine(onCharCode);
    }
    // Fail here instead of returning undefined, which would only surface on
    // the next byte as a confusing "state is not a function" TypeError.
    throw new Error("Invalid escape character in string: " +
      (byte === undefined ? "end of input" : "0x" + byte.toString(16)));
  }

  // Receives a decoded code point from utf8Machine or hexMachine.
  function onCharCode(charCode) {
    if (charCode > 0xffff) {
      // String.fromCharCode keeps only the low 16 bits, so code points outside
      // the Basic Multilingual Plane (4-byte UTF-8 sequences) have to be
      // written as a UTF-16 surrogate pair.
      var offset = charCode - 0x10000;
      string += String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
    } else {
      string += String.fromCharCode(charCode);
    }
    return $string;
  }
}
// Nestable state machine that decodes one multi-byte UTF-8 sequence.
//
// byte - the lead byte of the sequence (already read by the caller).
// emit - called with the decoded code point after the last continuation
//        byte; its return value becomes the next state.
//
// Returns the state expecting the first continuation byte. Throws an Error if
// `byte` is not a 2-, 3- or 4-byte lead, or if a continuation byte does not
// have the form 10xxxxxx.
function utf8Machine(byte, emit) {
  var pending;
  var codePoint;

  if (!(byte >= 0xc0 && byte < 0xf8)) {
    throw new Error("Invalid byte in UTF-8 string: 0x" + byte.toString(16));
  }

  if (byte < 0xe0) { // 110xxxxx: one continuation byte follows
    pending = 1;
    codePoint = (byte & 0x1f) << 6;
  } else if (byte < 0xf0) { // 1110xxxx: two continuation bytes follow
    pending = 2;
    codePoint = (byte & 0xf) << 12;
  } else { // 11110xxx: three continuation bytes follow
    pending = 3;
    codePoint = (byte & 0x07) << 18;
  }

  return $continuation;

  function $continuation(chunk) {
    if ((chunk & 0xc0) !== 0x80) {
      throw new Error("Invalid byte in UTF-8 character: 0x" + chunk.toString(16));
    }
    pending -= 1;
    // Each continuation byte carries six payload bits, most significant first.
    codePoint |= (chunk & 0x3f) << (pending * 6);
    return pending === 0 ? emit(codePoint) : $continuation;
  }
}
// Nestable state machine for the four hex digits of a \uXXXX string escape.
//
// emit - called with the 16-bit value once all four digits have been read;
//        its return value becomes the next state.
//
// Returns the state expecting the first hex digit. Throws an Error on any
// byte that is not 0-9, a-f or A-F.
function hexMachine(emit) {
  var left = 4, num = 0;
  return $hex;

  function $hex(byte) {
    var digit;
    // The decimal range ends at 0x39 ('9'); 0x3a..0x3f (: ; < = > ?) are not
    // hex digits and must be rejected.
    if (byte >= 0x30 && byte <= 0x39) digit = byte - 0x30; // 0-9
    else if (byte >= 0x61 && byte <= 0x66) digit = byte - 0x57; // a-f
    else if (byte >= 0x41 && byte <= 0x46) digit = byte - 0x37; // A-F
    else throw new Error("Expected hex char in string hex escape");
    // Digits arrive most significant first.
    num |= digit << (--left * 4);
    if (left) return $hex;
    return emit(num);
  }
}
// Nestable state machine for a JSON number.
//
// byte - the first byte of the number ('-' or a digit), already read by the
//        caller.
// emit - called as emit(value, byte) when a byte arrives that cannot continue
//        the number. `byte` is that terminating byte, which has NOT been
//        consumed and must still be handled by the caller; it is undefined on
//        the end-of-input flush. The return value of emit becomes the next
//        state.
//
// Returns the next state. Throws an Error if the number does not start with a
// digit (after the optional minus sign).
//
// The accepted characters are collected as text and converted with Number()
// once the number is complete, so the result is the correctly rounded value
// of the literal. Accumulating the value arithmetically loses precision, and
// the previous fraction formula `(decimal + digit) / 10` produced the
// fraction digits in reverse order (1.25 became 1.52).
function numberMachine(byte, emit) {
  var mantissa = ""; // sign, integer digits and fraction as written
  var exponentSign = ""; // "-" for a negative exponent
  var exponentDigits = ""; // digits after e/E, if any

  if (byte === 0x2d) { // -
    mantissa = "-";
    return $start;
  }
  return $start(byte);

  // Only 0x30..0x39 are digits; 0x3a..0x3f (: ; < = > ?) are not.
  function isDigit(code) {
    return code >= 0x30 && code <= 0x39;
  }

  // First digit of the integer part.
  function $start(byte) {
    if (byte === 0x30) {
      // A leading zero cannot be followed by more integer digits.
      mantissa += "0";
      return $mid;
    }
    if (isDigit(byte)) {
      return $number(byte);
    }
    throw new Error("Invalid number: " +
      (byte === undefined ? "unexpected end of input" : "0x" + byte.toString(16)));
  }

  // After the integer part: an optional fraction.
  function $mid(byte) {
    if (byte === 0x2e) { // .
      mantissa += ".";
      return $decimal;
    }
    return $later(byte);
  }

  // Remaining integer digits.
  function $number(byte) {
    if (isDigit(byte)) {
      mantissa += String.fromCharCode(byte);
      return $number;
    }
    return $mid(byte);
  }

  // Fraction digits.
  function $decimal(byte) {
    if (isDigit(byte)) {
      mantissa += String.fromCharCode(byte);
      return $decimal;
    }
    return $later(byte);
  }

  // After the fraction: an optional exponent.
  function $later(byte) {
    if (byte === 0x45 || byte === 0x65) { // E e
      return $esign;
    }
    return $done(byte);
  }

  // Optional sign of the exponent.
  function $esign(byte) {
    if (byte === 0x2b) { // +
      return $exponent;
    }
    if (byte === 0x2d) { // -
      exponentSign = "-";
      return $exponent;
    }
    return $exponent(byte);
  }

  // Exponent digits.
  function $exponent(byte) {
    if (isDigit(byte)) {
      exponentDigits += String.fromCharCode(byte);
      return $exponent;
    }
    return $done(byte);
  }

  // Convert the collected text and hand the value plus the unconsumed
  // terminating byte to the caller.
  function $done(byte) {
    var text = mantissa;
    // As before, an exponent marker without digits ("1e", "1e+") is ignored.
    if (exponentDigits) {
      text += "e" + exponentSign + exponentDigits;
    }
    return emit(Number(text), byte);
  }
}
// Nestable state machine for a JSON array. The caller consumes the opening
// '['; this machine consumes everything up to and including the matching ']'.
//
// emit - called with the finished array when ']' arrives; its return value
//        becomes the next state.
//
// Returns the state for the first byte after '['. Element values are parsed
// by nested jsonMachine instances that fall back to the separator state.
function arrayMachine(emit) {
  var items = [];

  // Value callback for the nested jsonMachine.
  function collect(value) {
    items.push(value);
  }

  // Between elements: skip whitespace, then expect ',' or ']'.
  function $separator(byte) {
    switch (byte) {
      case 0x09:
      case 0x0a:
      case 0x0d:
      case 0x20:
        return $separator; // Ignore whitespace
      case 0x2c: // ,
        return jsonMachine(collect, $separator);
      case 0x5d: // ]
        return emit(items);
      default:
        throw new Error("Unexpected byte: 0x" + byte.toString(16) + " in array body");
    }
  }

  // First byte after '[': either an immediate ']' or the start of a value.
  return function $first(byte) {
    return byte === 0x5d ? emit(items) : jsonMachine(collect, $separator)(byte);
  };
}
function objectMachine(emit) {
var object = {};
var key;
return $object;
function $object(byte) {
if (byte === 0x7d) { // }
return emit(object);
}
return $key(byte);
}
function $key(byte) {
if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
return $object; // Ignore whitespace
}
if (byte === 0x22) {
return stringMachine(onKey);
}
throw new Error("Unexpected byte: 0x" + byte.toString(16));
}
function onKey(result) {
key = result;
return $colon;
}
function $colon(byte) {
if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
return $colon; // Ignore whitespace
}
if (byte === 0x3a) { // :
return jsonMachine(onValue, $comma);
}
throw new Error("Unexpected byte: 0x" + byte.toString(16));
}
function onValue(value) {
object[key] = value;
}
function $comma(byte) {
if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
return $comma; // Ignore whitespace
}
if (byte === 0x2c) { // ,
return $key;
}
if (byte === 0x7d) { // }
return emit(object);
}
throw new Error("Unexpected byte: 0x" + byte.toString(16));
}
}
// Demo: run a few sample inputs through the parser one byte at a time and
// print every value it emits.
var inspect = require("util").inspect;
var inputs = [
  '"this is a \\u5ee9 string" "so is this €"\r\n"How about ¢?"\t"詩檧窣廩 禨碜婨, 珦覵 氨焨鋨"',
  '["a",1,[1,2,3]]',
  '12345 6789',
  '{"name":"Tim Caswell","age":31,"true":true,"false":false,"null":null}',
  '-1 -1.1 -0.3 3.14e-3 10E5'
];
inputs.forEach(function (input) {
  // Buffer.from replaces the deprecated `new Buffer(string)` constructor and,
  // like it, encodes the string as UTF-8 by default.
  var data = Buffer.from(input);
  var state = jsonMachine(emit);
  for (var i = 0, l = data.length; i < l; i++) {
    state = state(data[i]);
  }
  // Signal end of input so a trailing number (which has no closing delimiter)
  // is emitted.
  state();
});

// Prints one parsed value with colors.
function emit(value) {
  console.log(inspect(value, {colors:true}));
}
@creationix
Copy link
Author

To hook this up to a node stream with data and end events, you can do the following:

// Parse a file as a stream of JSON values, printing each value as it completes.
var input = require('fs').createReadStream("my-json-data");
var state = jsonMachine(emit);

input.on("data", function (chunk) {
  // Push the chunk through the machine one byte at a time; every call returns
  // the state that must receive the following byte.
  var offset = 0;
  while (offset < chunk.length) {
    state = state(chunk[offset]);
    offset += 1;
  }
});

input.on("end", function () {
  // Calling the state without a byte tells the parser the input is finished
  // so it can flush any remaining data. Nothing is emitted after this point.
  state();
});

// Receives every parsed value.
function emit(output) {
  console.log("OUTPUT", output);
}

@nhrones
Copy link

nhrones commented Feb 9, 2015

I can't follow this code. I come from a C# background. Can you explain the functions that are named with a $? I don't see where they are ever called.
I had the same problem digesting the parser in your 'chrome-app-module-loader'.
I'm currently writing some non-trivial chrome apps, and I really like the idea of your loader that allows modules to be developed with commonJS syntax.

@stephenhandley
Copy link

me neither dude

@creationix
Copy link
Author

The $ in the name doesn't mean anything. It could be a Z or an _ and the language doesn't care.

I think what's confusing to many is that this code returns functions as values in a lot of places and then calls those returned functions later. You'll need a good understanding of first-class functions to understand this code. It's common in JavaScript (which borrows a lot of its design from Scheme), but not so much in languages like C# or Java.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment