JSON parser
// json_parser.js
// TOKEN types
const COLON = 'COLON';
const L_CURLY = 'L_CURLY';
const R_CURLY = 'R_CURLY';
const L_SQUARE = 'L_SQUARE';
const R_SQUARE = 'R_SQUARE';
const COMMA = 'COMMA';
const BOOLEAN_LITERAL = 'BOOLEAN_LITERAL';
const NUMBER_LITERAL = 'NUMBER_LITERAL';
const STRING_LITERAL = 'STRING_LITERAL';
const NULL_LITERAL = 'NULL_LITERAL';
// TOKEN raw string
const COLON_STR = ':';
const L_CURLY_STR = '{';
const R_CURLY_STR = '}';
const L_SQUARE_STR = '[';
const R_SQUARE_STR = ']';
const COMMA_STR = ',';
const DOUBLE_QUOTE_STR = '"';
// Whitespace
const SPACE = ' ';
const RETURN = '\n';
const CARRIAGE_RETURN = '\r'; // a single character: the tokenizer inspects one char at a time, so '\r\n' would never match
const TAB = '\t';
const token_set = new Set();
token_set.add(COLON_STR);
token_set.add(L_CURLY_STR);
token_set.add(R_CURLY_STR);
token_set.add(L_SQUARE_STR);
token_set.add(R_SQUARE_STR);
token_set.add(COMMA_STR);
token_set.add(SPACE);
token_set.add(RETURN);
token_set.add(CARRIAGE_RETURN);
token_set.add(TAB); // tabs also delimit bare literals
/**
 * Lexer token shape:
 * {
 *   type:
 *     COLON, L_CURLY, R_CURLY, L_SQUARE, R_SQUARE, COMMA,
 *     BOOLEAN_LITERAL, NUMBER_LITERAL, STRING_LITERAL, NULL_LITERAL
 *   value?: string
 * }
 */
function tokenize(text) {
  const tokens = [];
  let ptr = 0, length = text.length, walk = -1;
  while (ptr < length) {
    switch (text[ptr]) {
      case L_CURLY_STR:
        tokens.push({ type: L_CURLY });
        break;
      case R_CURLY_STR:
        tokens.push({ type: R_CURLY });
        break;
      case L_SQUARE_STR:
        tokens.push({ type: L_SQUARE });
        break;
      case R_SQUARE_STR:
        tokens.push({ type: R_SQUARE });
        break;
      case COLON_STR:
        tokens.push({ type: COLON });
        break;
      case COMMA_STR:
        tokens.push({ type: COMMA });
        break;
      case SPACE:
      case TAB:
      case RETURN:
      case CARRIAGE_RETURN: // skip whitespace
        break;
      default: {
        let literal, literal_type;
        walk = ptr;
        if (text[walk] === DOUBLE_QUOTE_STR) {
          // String literal: scan to the closing quote. Escaped quotes (\")
          // are not handled; the bounds check prevents an infinite loop on
          // an unterminated string.
          while (walk < length && text[++walk] !== DOUBLE_QUOTE_STR);
          literal = text.slice(ptr, walk + 1);
          if (is_string(literal)) {
            literal_type = STRING_LITERAL;
          }
          ptr = walk;
        } else {
          // Bare literal: scan until a structural character or whitespace,
          // again guarding against running off the end of the input.
          while (walk < length && !token_set.has(text[++walk]));
          literal = text.slice(ptr, walk);
          literal_type = undefined;
          if (is_number(literal) || is_float(literal)) {
            literal_type = NUMBER_LITERAL;
          } else if (is_boolean(literal)) {
            literal_type = BOOLEAN_LITERAL;
          } else if (is_null(literal)) {
            literal_type = NULL_LITERAL;
          }
          ptr = walk - 1;
        }
        // An unrecognised literal keeps type === undefined and is rejected
        // later by lex_analyze.
        tokens.push({
          type: literal_type,
          value: literal
        });
        break;
      }
    }
    ptr++;
  }
  return tokens;
}
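// A quick sketch of what tokenize produces for a small input:
//
//   tokenize('{"a": 1}')
//   // [ { type: 'L_CURLY' },
//   //   { type: 'STRING_LITERAL', value: '"a"' },
//   //   { type: 'COLON' },
//   //   { type: 'NUMBER_LITERAL', value: '1' },
//   //   { type: 'R_CURLY' } ]
//
// Note that string tokens keep their surrounding quotes; stripQuotes removes
// them later, while walking the AST.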
function lex_analyze(tokens) {
  // Adjacency rules for objects:
  //   L_CURLY        -> STRING_LITERAL, R_CURLY
  //   STRING_LITERAL -> COLON, COMMA, R_CURLY
  //   literal        -> COMMA, R_CURLY
  //   COLON          -> literal, L_CURLY, L_SQUARE
  //   COMMA          -> STRING_LITERAL
  function lex_object(tokens, token_index) {
    let next_token = tokens[token_index + 1];
    switch (tokens[token_index].type) {
      case L_CURLY:
        switch (next_token.type) {
          case STRING_LITERAL: lex_object(tokens, token_index + 1);
            break;
          case R_CURLY: return;
          default: throw new Error('lex error');
        }
        break;
      case STRING_LITERAL:
        switch (next_token.type) {
          case COLON: lex_object(tokens, token_index + 1);
            break;
          case COMMA: lex_object(tokens, token_index + 1);
            break;
          case R_CURLY: return;
          default: throw new Error('lex error');
        }
        break;
      case NUMBER_LITERAL:
      case BOOLEAN_LITERAL:
      case NULL_LITERAL:
        switch (next_token.type) {
          case COMMA: lex_object(tokens, token_index + 1);
            break;
          case R_CURLY: return;
          default: throw new Error('lex error');
        }
        break;
      case COLON:
        switch (next_token.type) {
          case L_CURLY:
          case STRING_LITERAL:
          case NUMBER_LITERAL:
          case BOOLEAN_LITERAL:
          case NULL_LITERAL: lex_object(tokens, token_index + 1);
            break;
          case L_SQUARE: lex_array(tokens, token_index + 1);
            break;
          default: throw new Error('lex error');
        }
        break;
      case COMMA:
        switch (next_token.type) {
          case STRING_LITERAL: lex_object(tokens, token_index + 1);
            break;
          default: throw new Error('lex error');
        }
        break;
      default: throw new Error('lex error');
    }
  }
  // Adjacency rules for arrays:
  //   L_SQUARE -> literal, L_CURLY, L_SQUARE, R_SQUARE
  //   literal  -> COMMA, R_SQUARE
  //   COMMA    -> literal, L_CURLY, L_SQUARE
  function lex_array(tokens, token_index) {
    let next_token = tokens[token_index + 1];
    switch (tokens[token_index].type) {
      case L_SQUARE:
        switch (next_token.type) {
          case STRING_LITERAL:
          case NUMBER_LITERAL:
          case BOOLEAN_LITERAL:
          case NULL_LITERAL: lex_array(tokens, token_index + 1);
            break;
          case L_CURLY: lex_object(tokens, token_index + 1);
            break;
          case L_SQUARE: lex_array(tokens, token_index + 1);
            break;
          case R_SQUARE: return;
          default: throw new Error('lex error');
        }
        break;
      case STRING_LITERAL:
      case NUMBER_LITERAL:
      case BOOLEAN_LITERAL:
      case NULL_LITERAL:
        switch (next_token.type) {
          case COMMA: lex_array(tokens, token_index + 1);
            break;
          case R_SQUARE: return;
          default: throw new Error('lex error');
        }
        break;
      case COMMA:
        switch (next_token.type) {
          case STRING_LITERAL:
          case NUMBER_LITERAL:
          case BOOLEAN_LITERAL:
          case NULL_LITERAL: lex_array(tokens, token_index + 1);
            break;
          case L_CURLY: lex_object(tokens, token_index + 1);
            break;
          case L_SQUARE: lex_array(tokens, token_index + 1);
            break;
          default: throw new Error('lex error');
        }
        break;
      default: throw new Error('lex error');
    }
  }
  // NOTE: the recursion returns as soon as a nested object/array closes, so
  // tokens that follow a nested structure inside its parent are not checked;
  // only a prefix of the stream is validated in that case.
  // The stream must start with L_CURLY or L_SQUARE.
  switch (head(tokens).type) {
    case L_CURLY: lex_object(tokens, 0);
      break;
    case L_SQUARE: lex_array(tokens, 0);
      break;
    default: throw new Error('lex error');
  }
  return tokens;
}
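// lex_analyze passes a valid token stream through unchanged and throws on a
// disallowed pair of adjacent tokens, e.g.:
//
//   lex_analyze(tokenize('["a", 1]')); // returns the tokens untouched
//   lex_analyze(tokenize('[,]'));      // throws Error('lex error'):
//                                      // L_SQUARE may not be followed by COMMA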
// PRIMITIVE values
const TRUE = 'true';
const FALSE = 'false';
// TYPES
const OBJECT = 'object';
const ARRAY = 'array';
/**
 * AST Nodes
 * Value:
 *   | Object
 *   | Array
 *   | { type: BOOLEAN_LITERAL, value: string }
 *   | { type: NUMBER_LITERAL , value: string }
 *   | { type: STRING_LITERAL , value: string }
 *   | { type: NULL_LITERAL   , value: string }
 *
 * Array:
 * {
 *   type: 'array',
 *   children: Value[]
 * }
 *
 * Entry:
 * {
 *   key: string
 *   value: Value
 * }
 *
 * Object:
 * {
 *   type: 'object',
 *   children: Entry[]
 * }
 */
function parseObject(tokens, token_index) {
  const children = [];
  let ptr = token_index;
  while (tokens[ptr].type !== R_CURLY) {
    switch (tokens[ptr].type) {
      case COLON: {
        // The token just before the colon is the entry's key.
        const current_key = tokens[ptr - 1].value;
        const [value, index] = parseValue(tokens, ptr + 1);
        children.push({
          key: current_key,
          value
        });
        ptr = index;
        break;
      }
      case COMMA: // ignore
        break;
      default:
        break;
    }
    ptr++;
  }
  return [
    {
      type: OBJECT,
      children
    },
    ptr
  ];
}
function parseArray(tokens, token_index) {
  const children = [];
  let ptr = token_index;
  while (tokens[ptr].type !== R_SQUARE) {
    switch (tokens[ptr].type) {
      case COMMA: // ignore
        break;
      default: {
        const [value, index] = parseValue(tokens, ptr);
        children.push(value);
        ptr = index;
        break;
      }
    }
    ptr++;
  }
  return [
    {
      type: ARRAY,
      children
    },
    ptr
  ];
}
function parseValue(tokens, token_index) {
  const value_token = tokens[token_index];
  switch (value_token.type) {
    case L_CURLY: return parseObject(tokens, token_index + 1);
    case L_SQUARE: return parseArray(tokens, token_index + 1);
    case NULL_LITERAL:
    case BOOLEAN_LITERAL:
    case NUMBER_LITERAL:
    case STRING_LITERAL:
      return [
        value_token,
        token_index
      ];
    default: throw new Error('parse error');
  }
}
// helpers
function head(xs) { return xs && xs[0] }
function last(xs) { return xs && xs[xs.length - 1] }
function is_string(s) {
  if (s === undefined || s.length === 0) return false;
  return (s[0] === '"' && s[s.length - 1] === '"');
}
function is_number(n) {
  // Number() rejects trailing garbage such as '123abc', which
  // parseInt('123abc', 10) would happily truncate to 123.
  if (n === undefined || n === '') return false;
  return Number.isFinite(Number(n));
}
function is_float(n) {
  if (!n || n.indexOf('.') === -1) return false;
  return is_number(n);
}
function is_boolean(n) { return n === 'true' || n === 'false' }
function is_null(n) { return n === 'null' }
function stripQuotes(s) {
  if (typeof s !== 'string') return s;
  let start = 0, end = s.length;
  if (head(s) === DOUBLE_QUOTE_STR) start++;
  if (last(s) === DOUBLE_QUOTE_STR) end--;
  return s.substring(start, end);
}
function parse(tokens, token_index) {
  switch (head(tokens).type) {
    case L_CURLY: return parseObject(tokens, token_index + 1);
    case L_SQUARE: return parseArray(tokens, token_index + 1);
    default: throw new Error('parse error');
  }
}
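// Sketch of the [ast, last_index] pair parse returns for a small input
// (node shapes as documented in the AST comment above):
//
//   parse(lex_analyze(tokenize('["x"]')), 0);
//   // [ { type: 'array',
//   //     children: [ { type: 'STRING_LITERAL', value: '"x"' } ] },
//   //   2 ] // 2 = index of the closing R_SQUARE token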
function walker(ast_node, extraction_fn) {
  switch (ast_node.type) {
    case OBJECT:
      return ast_node.children.reduce((acc, child) => {
        const { key, value } = child;
        acc[stripQuotes(key)] = walker(value, extraction_fn);
        return acc;
      }, {});
    case ARRAY:
      return ast_node.children.reduce((acc, child) => {
        acc.push(walker(child, extraction_fn));
        return acc;
      }, []);
    case NULL_LITERAL:
    case NUMBER_LITERAL:
    case BOOLEAN_LITERAL:
    case STRING_LITERAL:
      return extraction_fn(ast_node);
  }
}
function extract_value(ast_node) {
  const { type, value } = ast_node;
  switch (type) {
    case NUMBER_LITERAL:
      // Number() covers integers, floats, and exponent forms such as '1e5',
      // which parseInt would truncate to 1.
      return Number(value);
    case STRING_LITERAL: return stripQuotes(value);
    case BOOLEAN_LITERAL: return value === TRUE;
    case NULL_LITERAL: return null;
  }
}
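// Example conversions, one per literal type:
//
//   extract_value({ type: NUMBER_LITERAL, value: '1.5' });    // 1.5
//   extract_value({ type: STRING_LITERAL, value: '"hi"' });   // 'hi'
//   extract_value({ type: BOOLEAN_LITERAL, value: 'true' });  // true
//   extract_value({ type: NULL_LITERAL, value: 'null' });     // null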
function json_parse(json_string) {
  const tokens = tokenize(json_string);
  const lexed_tokens = lex_analyze(tokens);
  const [ast, _] = parse(lexed_tokens, 0);
  return walker(ast, extract_value);
}
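// End-to-end usage:
//
//   json_parse('{"a": [1, true, null]}'); // { a: [ 1, true, null ] }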
module.exports = {
  json_parse
};
// Tests (Jest-style) for the parser exported above as './json_parser.js'
const { json_parse } = require('./json_parser.js');
describe("JSON parser accepts valid cases", () => {
  it("should parse valid JSON [empty object]", () => {
    const json = {};
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });
it("should parse valid JSON [empty array]", () => {
const json = [];
const json_string = JSON.stringify(json);
expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
});
it("should parse valid JSON [nested object]", () => {
const json = {
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": ["GML", "XML"]
},
"GlossSee": "markup"
}
}
}
}
};
const json_string = JSON.stringify(json);
expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
});
it("should parse valid JSON [nested objects]", () => {
const json = {
k1: {
k2: {
k3: {
k4: {
k5: true,
k9: 123
},
k6: 1.1123,
k10: false
},
k7: "string"
},
k8: null
}
};
const json_string = JSON.stringify(json);
expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
});
it("should parse valid JSON [nested arrays]", () => {
const json = [1, [2, [3, [4], 5], 6], 7];
const json_string = JSON.stringify(json);
expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
});
});
describe("JSON parser reject invalid cases", () => {
it("Reject {{{{}}}}", () => {
const json_string = "{{{{}}}}";
expect(() => json_parse(json_string)).toThrowError(new Error('lex error'));
});
it("Reject [,]", () => {
const json_string = "[,]";
expect(() => json_parse(json_string)).toThrowError(new Error('lex error'));
});
it("Reject [1, 2, ]", () => {
const json_string = "[1, 2, ]";
expect(() => json_parse(json_string)).toThrowError(new Error('lex error'));
});
it("Reject [{]}", () => {
const json_string = "[{]}";
expect(() => json_parse(json_string)).toThrowError(new Error('lex error'));
});
it("Reject {1, 2, 3, 4}", () => {
const json_string = "{1, 2, 3, 4}";
expect(() => json_parse(json_string)).toThrowError(new Error('lex error'));
});
it("Reject {a:", () => {
const json_string = "{a:";
expect(() => json_parse(json_string)).toThrowError(new Error('lex error'));
});
it("Reject {a:b}", () => {
const json_string = "{a:b}";
expect(() => json_parse(json_string)).toThrowError(new Error('lex error'));
});
it('Reject {"a":"b",}', () => {
const json_string = '{"a":"b",}';
expect(() => json_parse(json_string)).toThrowError(new Error('lex error'));
});
});