JSON parser
// TOKEN types
const COLON = 'COLON';
const L_CURLY = 'L_CURLY';
const R_CURLY = 'R_CURLY';
const L_SQUARE = 'L_SQUARE';
const R_SQUARE = 'R_SQUARE';
const COMMA = 'COMMA';
const BOOLEAN_LITERAL = 'BOOLEAN_LITERAL';
const NUMBER_LITERAL = 'NUMBER_LITERAL';
const STRING_LITERAL = 'STRING_LITERAL';
const NULL_LITERAL = 'NULL_LITERAL';
// TOKEN raw string
const COLON_STR = ':';
const L_CURLY_STR = '{';
const R_CURLY_STR = '}';
const L_SQUARE_STR = '[';
const R_SQUARE_STR = ']';
const COMMA_STR = ',';
const DOUBLE_QUOTE_STR = '"';
// Whitespace
const SPACE = ' ';
const RETURN = '\n';
const CARRIAGE_RETURN = '\r'; // a single character: the lexer compares one char at a time, so '\r\n' would never match
const TAB = '\t';
const token_set = new Set();
token_set.add(COLON_STR);
token_set.add(L_CURLY_STR);
token_set.add(R_CURLY_STR);
token_set.add(L_SQUARE_STR);
token_set.add(R_SQUARE_STR);
token_set.add(COMMA_STR);
token_set.add(SPACE);
token_set.add(TAB); // tabs also terminate a bare literal
token_set.add(RETURN);
token_set.add(CARRIAGE_RETURN);
/**
 * Lex Token
 * {
 *   type:
 *     COLON, L_CURLY, R_CURLY, L_SQUARE, R_SQUARE, COMMA,
 *     BOOLEAN_LITERAL, NUMBER_LITERAL, STRING_LITERAL, NULL_LITERAL
 *   value?: string
 * }
 */
function tokenize(text) {
  const tokens = [];
  let ptr = 0, length = text.length, walk = -1;
  while (ptr < length) {
    switch (text[ptr]) {
      case L_CURLY_STR:
        tokens.push({ type: L_CURLY });
        break;
      case R_CURLY_STR:
        tokens.push({ type: R_CURLY });
        break;
      case L_SQUARE_STR:
        tokens.push({ type: L_SQUARE });
        break;
      case R_SQUARE_STR:
        tokens.push({ type: R_SQUARE });
        break;
      case COLON_STR:
        tokens.push({ type: COLON });
        break;
      case COMMA_STR:
        tokens.push({ type: COMMA });
        break;
      case SPACE:
      case TAB:
      case RETURN:
      case CARRIAGE_RETURN: // ignore
        break;
      default:
        let literal, literal_type;
        walk = ptr;
        if (text[walk] === DOUBLE_QUOTE_STR) {
          // Scan to the closing quote. NOTE: escape sequences such as \"
          // are not handled by this lexer.
          while (++walk < length && text[walk] !== DOUBLE_QUOTE_STR);
          if (walk === length) throw new Error('lex error'); // unterminated string
          literal = text.slice(ptr, walk + 1);
          if (is_string(literal)) {
            literal_type = STRING_LITERAL;
          }
          ptr = walk;
        } else {
          // Scan until the next structural token or whitespace; guard
          // against running past the end of the input.
          while (++walk < length && !token_set.has(text[walk]));
          literal = text.slice(ptr, walk);
          literal_type = undefined;
          if (is_number(literal) || is_float(literal)) {
            literal_type = NUMBER_LITERAL;
          } else if (is_boolean(literal)) {
            literal_type = BOOLEAN_LITERAL;
          } else if (is_null(literal)) {
            literal_type = NULL_LITERAL;
          }
          ptr = walk - 1;
        }
        tokens.push({
          type: literal_type,
          value: literal
        });
        break;
    }
    ptr++;
  }
  return tokens;
}
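// Illustrative trace (an added example, not part of the original module):
// tokenize('{"a":1}') should yield
//   [
//     { type: 'L_CURLY' },
//     { type: 'STRING_LITERAL', value: '"a"' },
//     { type: 'COLON' },
//     { type: 'NUMBER_LITERAL', value: '1' },
//     { type: 'R_CURLY' }
//   ]
// Structural tokens carry no value; literals keep their raw source text
// (strings still include their surrounding quotes) until extract_value runs.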
function lex_analyze(tokens) {
  // lex rules for object
  //   L_CURLY        -> STRING_LITERAL, R_CURLY
  //   STRING_LITERAL -> COLON, COMMA, R_CURLY
  //   literal        -> COMMA, R_CURLY
  //   COLON          -> literal, L_CURLY, L_SQUARE
  //   COMMA          -> STRING_LITERAL
  // NOTE: validation is pairwise over adjacent tokens and stops at the
  // first closing bracket of a nested container, so this is a sanity
  // check rather than a full grammar check.
  function lex_object(tokens, token_index) {
    const next_token = tokens[token_index + 1] || {}; // a missing next token (truncated input) falls through to 'lex error'
    switch (tokens[token_index].type) {
      case L_CURLY:
        switch (next_token.type) {
          case STRING_LITERAL: lex_object(tokens, token_index + 1);
            break;
          case R_CURLY: return;
          default: throw new Error('lex error');
        }
        break;
      case STRING_LITERAL:
        switch (next_token.type) {
          case COLON: lex_object(tokens, token_index + 1);
            break;
          case COMMA: lex_object(tokens, token_index + 1);
            break;
          case R_CURLY: return;
          default: throw new Error('lex error');
        }
        break;
      case NUMBER_LITERAL:
      case BOOLEAN_LITERAL:
      case NULL_LITERAL:
        switch (next_token.type) {
          case COMMA: lex_object(tokens, token_index + 1);
            break;
          case R_CURLY: return;
          default: throw new Error('lex error');
        }
        break;
      case COLON:
        switch (next_token.type) {
          case L_CURLY:
          case STRING_LITERAL:
          case NUMBER_LITERAL:
          case BOOLEAN_LITERAL:
          case NULL_LITERAL: lex_object(tokens, token_index + 1);
            break;
          case L_SQUARE: lex_array(tokens, token_index + 1);
            break;
          default: throw new Error('lex error');
        }
        break;
      case COMMA:
        switch (next_token.type) {
          case STRING_LITERAL: lex_object(tokens, token_index + 1);
            break;
          default: throw new Error('lex error');
        }
        break;
      default: throw new Error('lex error');
    }
  }
  // lex rules for array
  //   L_SQUARE -> literal, L_CURLY, L_SQUARE, R_SQUARE
  //   literal  -> COMMA, R_SQUARE
  //   COMMA    -> literal, L_CURLY, L_SQUARE
  function lex_array(tokens, token_index) {
    const next_token = tokens[token_index + 1] || {}; // same guard against truncated input
    switch (tokens[token_index].type) {
      case L_SQUARE:
        switch (next_token.type) {
          case STRING_LITERAL:
          case NUMBER_LITERAL:
          case BOOLEAN_LITERAL:
          case NULL_LITERAL: lex_array(tokens, token_index + 1);
            break;
          case L_CURLY: lex_object(tokens, token_index + 1);
            break;
          case L_SQUARE: lex_array(tokens, token_index + 1);
            break;
          case R_SQUARE: return;
          default: throw new Error('lex error');
        }
        break;
      case STRING_LITERAL:
      case NUMBER_LITERAL:
      case BOOLEAN_LITERAL:
      case NULL_LITERAL:
        switch (next_token.type) {
          case COMMA: lex_array(tokens, token_index + 1);
            break;
          case R_SQUARE: return;
          default: throw new Error('lex error');
        }
        break;
      case COMMA:
        switch (next_token.type) {
          case STRING_LITERAL:
          case NUMBER_LITERAL:
          case BOOLEAN_LITERAL:
          case NULL_LITERAL: lex_array(tokens, token_index + 1);
            break;
          case L_CURLY: lex_object(tokens, token_index + 1);
            break;
          case L_SQUARE: lex_array(tokens, token_index + 1);
            break;
          default: throw new Error('lex error');
        }
        break;
      default: throw new Error('lex error');
    }
  }
  // a document must start with L_CURLY | L_SQUARE
  switch ((head(tokens) || {}).type) { // empty input falls through to 'lex error'
    case L_CURLY: lex_object(tokens, 0);
      break;
    case L_SQUARE: lex_array(tokens, 0);
      break;
    default: throw new Error('lex error');
  }
  return tokens;
}
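// Illustrative example (an added example, not part of the original module):
// lex_analyze checks each adjacent token pair against the rules above.
// For tokenize('{1}') the pair (L_CURLY, NUMBER_LITERAL) matches no rule,
// so it throws 'lex error'; for tokenize('{"a":1}') every pair is legal
// and the token stream is returned unchanged.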
// PRIMITIVE values
const TRUE = 'true';
const FALSE = 'false';
// TYPES
const OBJECT = 'object';
const ARRAY = 'array';
/**
 * AST Nodes
 * Value:
 *   | Object
 *   | Array
 *   | { type: BOOLEAN_LITERAL, value: string }
 *   | { type: NUMBER_LITERAL , value: string }
 *   | { type: STRING_LITERAL , value: string }
 *   | { type: NULL_LITERAL   , value: string }
 *
 * Array:
 *   {
 *     type: 'array',
 *     children: Value[]
 *   }
 *
 * Entry:
 *   {
 *     key: string
 *     value: Value   (a single value per entry, not an array)
 *   }
 * Object:
 *   {
 *     type: 'object',
 *     children: Entry[]
 *   }
 */
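// Illustrative example (an added example, not part of the original module):
// parse(tokenize('{"a":[1]}'), 0)[0] should produce the AST
//   {
//     type: 'object',
//     children: [{
//       key: '"a"', // raw key, quotes still attached
//       value: {
//         type: 'array',
//         children: [{ type: 'NUMBER_LITERAL', value: '1' }]
//       }
//     }]
//   }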
function parseObject(tokens, token_index) {
  const children = [];
  let ptr = token_index;
  while (tokens[ptr].type !== R_CURLY) {
    switch (tokens[ptr].type) {
      case COLON: {
        const current_key = tokens[ptr - 1].value; // the STRING_LITERAL before the colon
        const [value, index] = parseValue(tokens, ptr + 1);
        children.push({
          key: current_key,
          value
        });
        ptr = index;
      }
        break;
      case COMMA: // ignore
        break;
      default:
        break;
    }
    ptr++;
  }
  return [
    {
      type: OBJECT,
      children
    },
    ptr
  ];
}
function parseArray(tokens, token_index) {
  const children = [];
  let ptr = token_index;
  while (tokens[ptr].type !== R_SQUARE) {
    switch (tokens[ptr].type) {
      case COMMA: // ignore
        break;
      default:
        const [value, index] = parseValue(tokens, ptr);
        children.push(value);
        ptr = index;
        break;
    }
    ptr++;
  }
  return [
    {
      type: ARRAY,
      children
    },
    ptr
  ];
}
function parseValue(tokens, token_index) {
  const value_token = tokens[token_index];
  switch (value_token.type) {
    case L_CURLY: return parseObject(tokens, token_index + 1);
    case L_SQUARE: return parseArray(tokens, token_index + 1);
    case NULL_LITERAL:
    case BOOLEAN_LITERAL:
    case NUMBER_LITERAL:
    case STRING_LITERAL:
      // literals are returned as-is; the second element is the index of
      // the last token consumed
      return [
        value_token,
        token_index
      ];
    default: throw new Error('parse error');
  }
}
// helpers
function head(xs) { return xs && xs[0] }
function last(xs) { return xs && xs[xs.length - 1] }
function is_string(s) {
  if (s === undefined || s.length === 0) return false;
  return (s[0] === '"' && s[s.length - 1] === '"');
}
// Number() rejects trailing garbage that parseInt would accept (e.g. '12px')
function is_number(n) { return n !== '' && !isNaN(Number(n)) }
function is_float(n) {
  if (!n || n.indexOf('.') === -1) return false;
  return is_number(n);
}
function is_boolean(n) { return n === 'true' || n === 'false' }
function is_null(n) { return n === 'null' }
function stripQuotes(s) {
  if (typeof s !== 'string') return s;
  let start = 0, end = s.length;
  if (head(s) === DOUBLE_QUOTE_STR) start++;
  if (last(s) === DOUBLE_QUOTE_STR) end--;
  return s.substring(start, end);
}
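// Illustrative helper behaviour (added examples, not part of the original module):
//   is_number('42')     // true
//   is_number('1e3')    // true (Number() accepts exponent notation)
//   is_float('3.14')    // true
//   is_boolean('true')  // true
//   stripQuotes('"hi"') // 'hi'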
function parse(tokens, token_index) {
  switch (head(tokens).type) {
    case L_CURLY: return parseObject(tokens, token_index + 1);
    case L_SQUARE: return parseArray(tokens, token_index + 1);
    default: throw new Error('parse error');
  }
}
function walker(ast_node, extraction_fn) {
  switch (ast_node.type) {
    case OBJECT:
      return ast_node.children.reduce((acc, child) => {
        const { key, value } = child;
        acc[stripQuotes(key)] = walker(value, extraction_fn);
        return acc;
      }, {});
    case ARRAY:
      return ast_node.children.reduce((acc, child) => {
        acc.push(walker(child, extraction_fn));
        return acc;
      }, []);
    case NULL_LITERAL:
    case NUMBER_LITERAL:
    case BOOLEAN_LITERAL:
    case STRING_LITERAL:
      return extraction_fn(ast_node);
  }
}
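// Illustrative example (an added example, not part of the original module):
// walker folds the AST into a plain JavaScript value, delegating leaf
// conversion to extraction_fn. With extract_value (below) as the
// extraction function, the AST for '{"a":[1]}' collapses to { a: [1] }.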
function extract_value(ast_node) {
  const { type, value } = ast_node;
  switch (type) {
    case NUMBER_LITERAL:
      if (is_float(value)) return parseFloat(value);
      if (is_number(value)) return parseInt(value, 10);
      break;
    case STRING_LITERAL: return stripQuotes(value);
    case BOOLEAN_LITERAL: return value === TRUE;
    case NULL_LITERAL: return null;
  }
}
function json_parse(json_string) {
  const tokens = tokenize(json_string);
  const lexed_tokens = lex_analyze(tokens);
  const [ast, _] = parse(lexed_tokens, 0);
  return walker(ast, extract_value);
}
module.exports = {
  json_parse
};
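// Illustrative usage (an added example, not part of the original module):
//   const { json_parse } = require('./json_parser.js');
//   json_parse('{"a": [1, true, null]}'); // → { a: [1, true, null] }

// ------------------------------------------------------------------
// Test suite (Jest); a separate file that requires the module above,
// saved as json_parser.js.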
const { json_parse } = require('./json_parser.js');
describe("JSON parser accept valid cases", () => {
  it("should parse valid JSON [empty object]", () => {
    const json = {};
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });
  it("should parse valid JSON [empty array]", () => {
    const json = [];
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });
  it("should parse valid JSON [nested object]", () => {
    const json = {
      "glossary": {
        "title": "example glossary",
        "GlossDiv": {
          "title": "S",
          "GlossList": {
            "GlossEntry": {
              "ID": "SGML",
              "SortAs": "SGML",
              "GlossTerm": "Standard Generalized Markup Language",
              "Acronym": "SGML",
              "Abbrev": "ISO 8879:1986",
              "GlossDef": {
                "para": "A meta-markup language, used to create markup languages such as DocBook.",
                "GlossSeeAlso": ["GML", "XML"]
              },
              "GlossSee": "markup"
            }
          }
        }
      }
    };
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });
  it("should parse valid JSON [nested objects]", () => {
    const json = {
      k1: {
        k2: {
          k3: {
            k4: {
              k5: true,
              k9: 123
            },
            k6: 1.1123,
            k10: false
          },
          k7: "string"
        },
        k8: null
      }
    };
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });
  it("should parse valid JSON [nested arrays]", () => {
    const json = [1, [2, [3, [4], 5], 6], 7];
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });
});
describe("JSON parser reject invalid cases", () => { | |
it("Reject {{{{}}}}", () => { | |
const json_string = "{{{{}}}}"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject [,]", () => { | |
const json_string = "[,]"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject [1, 2, ]", () => { | |
const json_string = "[1, 2, ]"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject [{]}", () => { | |
const json_string = "[{]}"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject {1, 2, 3, 4}", () => { | |
const json_string = "{1, 2, 3, 4}"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject {a:", () => { | |
const json_string = "{a:"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject {a:b}", () => { | |
const json_string = "{a:b}"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it('Reject {"a":"b",}', () => { | |
const json_string = '{"a":"b",}'; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
}); |