Skip to content

Instantly share code, notes, and snippets.

Last active May 10, 2024 14:36
Show Gist options
  • Save creationix/5992451 to your computer and use it in GitHub Desktop.
Save creationix/5992451 to your computer and use it in GitHub Desktop.
A streaming JSON parser as an embeddable state machine.
// A streaming, byte-oriented JSON parser. Feed it a single byte at a time and
// it will emit complete values as it comes across them. Whitespace within and
// between values is ignored, which means it can parse newline-delimited JSON.
//
// emit(value) - called once for every completed value.
// next(byte)  - optional state. It receives any byte that cannot start a
//               value, and parsing continues in it after each completed
//               value. When omitted, the value state itself is used, so the
//               machine keeps reading values and throws on a stray byte.
//
// Returns the initial state: a function that accepts one byte and returns the
// state that must be given the following byte.
function jsonMachine(emit, next) {
  next = next || $value;
  return $value;

  function $value(byte) {
    // A missing byte (e.g. `state()` called with no argument) or a NUL byte
    // stops the machine: no follow-up state is returned.
    if (!byte) return;
    if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
      return $value; // Ignore whitespace
    }
    if (byte === 0x22) { // "
      return stringMachine(onValue);
    }
    // Digits are 0x30-0x39 only; 0x3a-0x3f (: ; < = > ?) must not start a number.
    if (byte === 0x2d || (byte >= 0x30 && byte <= 0x39)) { // - or 0-9
      return numberMachine(byte, onNumber);
    }
    if (byte === 0x7b) { // {
      return objectMachine(onValue);
    }
    if (byte === 0x5b) { // [
      return arrayMachine(onValue);
    }
    if (byte === 0x74) { // t
      return constantMachine(TRUE, true, onValue);
    }
    if (byte === 0x66) { // f
      return constantMachine(FALSE, false, onValue);
    }
    if (byte === 0x6e) { // n
      return constantMachine(NULL, null, onValue);
    }
    if (next === $value) {
      // No outer state to hand the byte to, so it is simply invalid here.
      throw new Error("Unexpected 0x" + byte.toString(16));
    }
    return next(byte);
  }

  // Completion callback for strings, objects, arrays and constants.
  function onValue(value) {
    emit(value);
    return next;
  }

  // Completion callback for numbers. A number only ends when a non-number
  // byte shows up, so that terminating byte is re-dispatched here.
  function onNumber(number, byte) {
    emit(number);
    return $value(byte);
  }
}
// Remaining bytes of each literal once its first byte has been consumed.
var TRUE = [0x72, 0x75, 0x65];         // "rue"
var FALSE = [0x61, 0x6c, 0x73, 0x65];  // "alse"
var NULL = [0x75, 0x6c, 0x6c];         // "ull"

// State machine that matches a fixed byte sequence.
//
// bytes - array of the byte values that must arrive, in order.
// value - what to hand to `emit` once every byte has matched.
// emit  - called as emit(value); its return value becomes the next state.
//
// Throws an Error on the first byte that does not match, including when the
// input ends (byte is undefined) in the middle of the literal.
function constantMachine(bytes, value, emit) {
  var i = 0, l = bytes.length;
  return $constant;

  function $constant(byte) {
    if (byte !== bytes[i++]) {
      if (byte === undefined) {
        // byte.toString() would raise a TypeError and hide the real problem.
        throw new Error("Unexpected end of input");
      }
      throw new Error("Unexpected 0x" + byte.toString(16));
    }
    if (i < l) return $constant;
    return emit(value);
  }
}
// State machine for the body of a JSON string; the opening quote has already
// been consumed by the caller.
//
// emit - called as emit(string) when the closing quote arrives; its return
//        value becomes the next state.
//
// Throws an Error on a raw control character or an unknown escape sequence.
function stringMachine(emit) {
  var string = "";
  return $string;

  function $string(byte) {
    if (byte === 0x22) { // "
      return emit(string);
    }
    if (byte === 0x5c) { // \
      return $escapedString;
    }
    if (byte & 0x80) { // UTF-8 handling
      return utf8Machine(byte, onCharCode);
    }
    if (byte < 0x20) { // ASCII control character
      throw new Error("Unexpected control character: 0x" + byte.toString(16));
    }
    string += String.fromCharCode(byte);
    return $string;
  }

  // Handles the single byte that follows a backslash.
  function $escapedString(byte) {
    if (byte === 0x22 || byte === 0x5c || byte === 0x2f) { // " \ /
      string += String.fromCharCode(byte);
      return $string;
    }
    if (byte === 0x62) { // b
      string += "\b";
      return $string;
    }
    if (byte === 0x66) { // f
      string += "\f";
      return $string;
    }
    if (byte === 0x6e) { // n
      string += "\n";
      return $string;
    }
    if (byte === 0x72) { // r
      string += "\r";
      return $string;
    }
    if (byte === 0x74) { // t
      string += "\t";
      return $string;
    }
    if (byte === 0x75) { // u
      return hexMachine(onCharCode);
    }
    // Without this the function would return undefined and the caller would
    // fail on the following byte with an unrelated TypeError.
    throw new Error("Invalid escape in string: " +
      (byte === undefined ? "end of input" : "0x" + byte.toString(16)));
  }

  // Receives a decoded code point from utf8Machine or hexMachine.
  function onCharCode(charCode) {
    if (charCode > 0xffff) {
      // String.fromCharCode keeps only the low 16 bits, so a code point
      // outside the BMP has to be written as a surrogate pair.
      var offset = charCode - 0x10000;
      string += String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
    } else {
      string += String.fromCharCode(charCode);
    }
    return $string;
  }
}
// Nestable state machine for UTF-8 decoding.
//
// byte - the lead byte of a multi-byte sequence (high bit set).
// emit - called as emit(codePoint) once the whole sequence has been read;
//        its return value becomes the next state.
//
// Returns the state that expects the first continuation byte.
// Throws an Error on an invalid lead byte, an invalid continuation byte, or a
// sequence that decodes to an overlong form, a surrogate (0xD800-0xDFFF) or a
// value above 0x10FFFF.
function utf8Machine(byte, emit) {
  var left = 0, num = 0;
  var min = 0; // Smallest code point that legitimately needs this many bytes
  if (byte >= 0xc0 && byte < 0xe0) { // 2-byte UTF-8 Character
    left = 1;
    num = (byte & 0x1f) << 6;
    min = 0x80;
    return $utf8;
  }
  if (byte >= 0xe0 && byte < 0xf0) { // 3-byte UTF-8 Character
    left = 2;
    num = (byte & 0xf) << 12;
    min = 0x800;
    return $utf8;
  }
  if (byte >= 0xf0 && byte < 0xf8) { // 4-byte UTF-8 Character
    left = 3;
    num = (byte & 0x07) << 18;
    min = 0x10000;
    return $utf8;
  }
  throw new Error("Invalid byte in UTF-8 string: 0x" + byte.toString(16));

  function $utf8(byte) {
    if ((byte & 0xc0) !== 0x80) {
      throw new Error("Invalid byte in UTF-8 character: 0x" + byte.toString(16));
    }
    // Each continuation byte contributes six bits, most significant first.
    num |= (byte & 0x3f) << (--left * 6);
    if (left) return $utf8;
    if (num < min || num > 0x10ffff || (num >= 0xd800 && num <= 0xdfff)) {
      throw new Error("Invalid UTF-8 code point: 0x" + num.toString(16));
    }
    return emit(num);
  }
}
// Nestable state machine for hex escaped characters: reads exactly four hex
// digits, most significant first.
//
// emit - called as emit(number) after the fourth digit; its return value
//        becomes the next state.
//
// Throws an Error on any byte that is not 0-9, a-f or A-F.
function hexMachine(emit) {
  var left = 4, num = 0;
  return $hex;

  function $hex(byte) {
    var i = 0; // Parse the hex byte
    // Upper bound is 0x39 ('9'); 0x3a-0x3f (: ; < = > ?) are not hex digits.
    if (byte >= 0x30 && byte <= 0x39) i = byte - 0x30;
    else if (byte >= 0x61 && byte <= 0x66) i = byte - 0x57;
    else if (byte >= 0x41 && byte <= 0x46) i = byte - 0x37;
    else throw new Error("Expected hex char in string hex escape");
    num |= i << (--left * 4);
    if (left) return $hex;
    return emit(num);
  }
}
// State machine for JSON numbers.
//
// byte - the first byte of the number ('-' or a digit), already read by the
//        caller.
// emit - called as emit(value, terminator), where terminator is the first
//        byte that is not part of the number (undefined if the caller flushed
//        the state without a byte); its return value becomes the next state.
//
// The validated characters are collected and converted once with Number(), so
// the result is the correctly rounded double for the literal. Summing digits
// with repeated floating-point arithmetic would accumulate error.
//
// As before, a '.' with no fraction digits and an 'e' with no exponent digits
// are tolerated; the empty part is simply ignored.
//
// Throws an Error when the first digit is missing.
function numberMachine(byte, emit) {
  var mantissa = "";       // Optional '-', integer digits, optional fraction
  var exponentSign = "";   // "" or "-"
  var exponentDigits = "";

  if (byte === 0x2d) { // -
    mantissa = "-";
    return $start;
  }
  return $start(byte);

  // Digits are 0x30-0x39 only; 0x3a-0x3f (: ; < = > ?) end the number.
  function isDigit(code) {
    return code >= 0x30 && code <= 0x39;
  }

  // First digit: a leading zero may not be followed by more integer digits.
  function $start(byte) {
    if (byte === 0x30) {
      mantissa += "0";
      return $mid;
    }
    if (byte > 0x30 && byte <= 0x39) {
      return $number(byte);
    }
    throw new Error("Invalid number: 0x" + byte.toString(16));
  }

  // After the integer part: an optional fraction follows.
  function $mid(byte) {
    if (byte === 0x2e) { // .
      mantissa += ".";
      return $decimal;
    }
    return $later(byte);
  }

  function $number(byte) {
    if (isDigit(byte)) {
      mantissa += String.fromCharCode(byte);
      return $number;
    }
    return $mid(byte);
  }

  function $decimal(byte) {
    if (isDigit(byte)) {
      mantissa += String.fromCharCode(byte);
      return $decimal;
    }
    return $later(byte);
  }

  // After the fraction: an optional exponent follows.
  function $later(byte) {
    if (byte === 0x45 || byte === 0x65) { // E e
      return $esign;
    }
    return $done(byte);
  }

  function $esign(byte) {
    if (byte === 0x2b) { // +
      return $exponent;
    }
    if (byte === 0x2d) { // -
      exponentSign = "-";
      return $exponent;
    }
    return $exponent(byte);
  }

  function $exponent(byte) {
    if (isDigit(byte)) {
      exponentDigits += String.fromCharCode(byte);
      return $exponent;
    }
    return $done(byte);
  }

  function $done(byte) {
    var text = mantissa;
    if (exponentDigits) {
      text += "e" + exponentSign + exponentDigits;
    }
    return emit(Number(text), byte);
  }
}
// State machine for the body of a JSON array; the opening '[' has already
// been consumed by the caller.
//
// emit - called as emit(array) when the closing ']' arrives; its return value
//        becomes the next state.
//
// Throws an Error when something other than ',' or ']' follows an element.
function arrayMachine(emit) {
  var array = [];
  return $array;

  // First byte after '[': either an immediate ']' or the start of a value.
  function $array(byte) {
    if (byte === 0x5d) { // ]
      return emit(array);
    }
    return jsonMachine(onValue, $comma)(byte);
  }

  // Called by the nested value machine for each completed element. The
  // nested machine continues in $comma on its own.
  function onValue(value) {
    array.push(value);
  }

  // After an element: expect ',' or ']'.
  function $comma(byte) {
    if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
      return $comma; // Ignore whitespace
    }
    if (byte === 0x2c) { // ,
      return jsonMachine(onValue, $comma);
    }
    if (byte === 0x5d) { // ]
      return emit(array);
    }
    throw new Error("Unexpected byte: 0x" + byte.toString(16) + " in array body");
  }
}
// State machine for the body of a JSON object; the opening '{' has already
// been consumed by the caller.
//
// emit - called as emit(object) when the closing '}' arrives; its return
//        value becomes the next state.
//
// Throws an Error on any byte that does not fit the
// key ':' value (',' key ':' value)* '}' shape.
function objectMachine(emit) {
  var object = {};
  var key;
  return $object;

  // Right after '{': the object may close immediately or a key may start.
  function $object(byte) {
    if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
      return $object; // Ignore whitespace
    }
    if (byte === 0x7d) { // }
      return emit(object);
    }
    return $key(byte);
  }

  // A key is required here. Whitespace stays in this state rather than going
  // back to $object, so that `{"a":1, }` is rejected just like `{"a":1,}`.
  function $key(byte) {
    if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
      return $key; // Ignore whitespace
    }
    if (byte === 0x22) {
      return stringMachine(onKey);
    }
    throw new Error("Unexpected byte: 0x" + byte.toString(16));
  }

  function onKey(result) {
    key = result;
    return $colon;
  }

  function $colon(byte) {
    if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
      return $colon; // Ignore whitespace
    }
    if (byte === 0x3a) { // :
      return jsonMachine(onValue, $comma);
    }
    throw new Error("Unexpected byte: 0x" + byte.toString(16));
  }

  // Called by the nested value machine; it continues in $comma on its own.
  function onValue(value) {
    if (key === "__proto__") {
      // Plain assignment would go through the Object.prototype setter and
      // replace (or silently ignore) the prototype instead of storing data.
      Object.defineProperty(object, key, {
        value: value,
        writable: true,
        enumerable: true,
        configurable: true
      });
      return;
    }
    object[key] = value;
  }

  // After a value: expect ',' or '}'.
  function $comma(byte) {
    if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
      return $comma; // Ignore whitespace
    }
    if (byte === 0x2c) { // ,
      return $key;
    }
    if (byte === 0x7d) { // }
      return emit(object);
    }
    throw new Error("Unexpected byte: 0x" + byte.toString(16));
  }
}
var inspect = require("util").inspect;
// Demo: run a few sample documents through the parser one byte at a time and
// print every value it emits.
var inputs = [
  '"this is a \\u5ee9 string" "so is this €"\r\n"How about ¢?"\t"詩檧窣廩 禨碜婨, 珦覵 氨焨鋨"',
  '12345 6789',
  '{"name":"Tim Caswell","age":31,"true":true,"false":false,"null":null}',
  '-1 -1.1 -0.3 3.14e-3 10E5'
];

inputs.forEach(function (input) {
  // Buffer.from replaces the deprecated `new Buffer(string)` constructor.
  var data = Buffer.from(input);
  var state = jsonMachine(emit);
  for (var i = 0, l = data.length; i < l; i++) {
    state = state(data[i]);
  }
  // A number is only emitted once a terminating byte arrives, so an input
  // that ends in a number (e.g. "6789", "10E5") needs one final call with no
  // byte to flush it.
  if (state) state();
});

function emit(value) {
  console.log(inspect(value, {colors:true}));
}
Copy link

The $ in the name doesn't mean anything. It could be a Z or an _ and the language doesn't care.

I think what confuses many people is that this code returns functions as values in a lot of places and then calls those returned functions later. You'll need a good understanding of first-class functions to understand this code. This style is common in JavaScript (which borrows a lot of its design from Scheme), but not so much in languages like C# or Java.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment