JSON parser — a hand-rolled JSON parser in JavaScript: a tokenizer, a token-stream validator, and a recursive-descent parser that builds an AST, plus a walker that converts the AST into plain JavaScript values. A Jest-style spec follows the parser.
// TOKEN types
const COLON = 'COLON';
const L_CURLY = 'L_CURLY';
const R_CURLY = 'R_CURLY';
const L_SQUARE = 'L_SQUARE';
const R_SQUARE = 'R_SQUARE';
const COMMA = 'COMMA';
const BOOLEAN_LITERAL = 'BOOLEAN_LITERAL';
const NUMBER_LITERAL = 'NUMBER_LITERAL';
const STRING_LITERAL = 'STRING_LITERAL';
const NULL_LITERAL = 'NULL_LITERAL';

// TOKEN raw strings
const COLON_STR = ':';
const L_CURLY_STR = '{';
const R_CURLY_STR = '}';
const L_SQUARE_STR = '[';
const R_SQUARE_STR = ']';
const COMMA_STR = ',';
const DOUBLE_QUOTE_STR = '"';

// Whitespace
const SPACE = ' ';
const RETURN = '\n';
const CARRIAGE_RETURN = '\r'; // single character: '\r\n' could never match a one-character switch case
const TAB = '\t';
// Characters that terminate a literal while scanning
const token_set = new Set();
token_set.add(COLON_STR);
token_set.add(L_CURLY_STR);
token_set.add(R_CURLY_STR);
token_set.add(L_SQUARE_STR);
token_set.add(R_SQUARE_STR);
token_set.add(COMMA_STR);
token_set.add(SPACE);
token_set.add(RETURN);
token_set.add(CARRIAGE_RETURN);
token_set.add(TAB); // without TAB in the set, tab-delimited literals would not terminate
/**
 * Lex Token
 * {
 *   type:
 *     COLON, L_CURLY, R_CURLY, L_SQUARE, R_SQUARE, COMMA,
 *     BOOLEAN_LITERAL, NUMBER_LITERAL, STRING_LITERAL, NULL_LITERAL
 *   value?: string
 * }
 */
function tokenize(text) {
  const tokens = [];
  let ptr = 0, length = text.length, walk = -1;
  while (ptr < length) {
    switch (text[ptr]) {
      case L_CURLY_STR:
        tokens.push({ type: L_CURLY });
        break;
      case R_CURLY_STR:
        tokens.push({ type: R_CURLY });
        break;
      case L_SQUARE_STR:
        tokens.push({ type: L_SQUARE });
        break;
      case R_SQUARE_STR:
        tokens.push({ type: R_SQUARE });
        break;
      case COLON_STR:
        tokens.push({ type: COLON });
        break;
      case COMMA_STR:
        tokens.push({ type: COMMA });
        break;
      case SPACE:
      case TAB:
      case RETURN:
      case CARRIAGE_RETURN: // ignore whitespace
        break;
      default:
        let literal, literal_type;
        walk = ptr;
        if (text[walk] === DOUBLE_QUOTE_STR) {
          // Scan to the closing quote, stepping over escaped characters such as \".
          while (text[++walk] !== DOUBLE_QUOTE_STR) {
            if (walk >= length) throw new Error('lex error'); // unterminated string
            if (text[walk] === '\\') walk++;
          }
          literal = text.slice(ptr, walk + 1);
          if (is_string(literal)) {
            literal_type = STRING_LITERAL;
          }
          ptr = walk;
        } else {
          // Scan to the next delimiter; the bounds check avoids looping past end of input.
          while (++walk < length && !token_set.has(text[walk]));
          literal = text.slice(ptr, walk);
          literal_type = undefined;
          if (is_number(literal) || is_float(literal)) {
            literal_type = NUMBER_LITERAL;
          } else if (is_boolean(literal)) {
            literal_type = BOOLEAN_LITERAL;
          } else if (is_null(literal)) {
            literal_type = NULL_LITERAL;
          }
          ptr = walk - 1;
        }
        tokens.push({
          type: literal_type,
          value: literal
        });
        break;
    }
    ptr++;
  }
  return tokens;
}
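
// For illustration (not part of the original gist): tokenizing a small
// input produces a flat token stream, e.g.
//   tokenize('{"a": 1}')
//   // -> [ { type: 'L_CURLY' },
//   //      { type: 'STRING_LITERAL', value: '"a"' },
//   //      { type: 'COLON' },
//   //      { type: 'NUMBER_LITERAL', value: '1' },
//   //      { type: 'R_CURLY' } ]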
function lex_analyze(tokens) {
  // Adjacency rules for tokens inside an object: each token type constrains
  // which token type may follow it (only local transitions are checked,
  // not bracket balance).
  // L_CURLY        -> STRING_LITERAL, R_CURLY
  // STRING_LITERAL -> COLON, COMMA, R_CURLY
  // literal        -> COMMA, R_CURLY
  // COLON          -> literal, L_CURLY, L_SQUARE
  // COMMA          -> STRING_LITERAL
  function lex_object(tokens, token_index) {
    let next_token = tokens[token_index + 1];
    if (!next_token) throw new Error('lex error'); // input truncated mid-object
    switch (tokens[token_index].type) {
      case L_CURLY:
        switch (next_token.type) {
          case STRING_LITERAL: lex_object(tokens, token_index + 1);
            break;
          case R_CURLY: return;
          default: throw new Error('lex error');
        }
        break;
      case STRING_LITERAL:
        switch (next_token.type) {
          case COLON: lex_object(tokens, token_index + 1);
            break;
          case COMMA: lex_object(tokens, token_index + 1);
            break;
          case R_CURLY: return;
          default: throw new Error('lex error');
        }
        break;
      case NUMBER_LITERAL:
      case BOOLEAN_LITERAL:
      case NULL_LITERAL:
        switch (next_token.type) {
          case COMMA: lex_object(tokens, token_index + 1);
            break;
          case R_CURLY: return;
          default: throw new Error('lex error');
        }
        break;
      case COLON:
        switch (next_token.type) {
          case L_CURLY:
          case STRING_LITERAL:
          case NUMBER_LITERAL:
          case BOOLEAN_LITERAL:
          case NULL_LITERAL: lex_object(tokens, token_index + 1);
            break;
          case L_SQUARE: lex_array(tokens, token_index + 1);
            break;
          default: throw new Error('lex error');
        }
        break;
      case COMMA:
        switch (next_token.type) {
          case STRING_LITERAL: lex_object(tokens, token_index + 1);
            break;
          default: throw new Error('lex error');
        }
        break;
      default: throw new Error('lex error');
    }
  }
  // Adjacency rules for tokens inside an array:
  // L_SQUARE -> literal, L_CURLY, L_SQUARE, R_SQUARE
  // literal  -> COMMA, R_SQUARE
  // COMMA    -> literal, L_CURLY, L_SQUARE
  function lex_array(tokens, token_index) {
    let next_token = tokens[token_index + 1];
    if (!next_token) throw new Error('lex error'); // input truncated mid-array
    switch (tokens[token_index].type) {
      case L_SQUARE:
        switch (next_token.type) {
          case STRING_LITERAL:
          case NUMBER_LITERAL:
          case BOOLEAN_LITERAL:
          case NULL_LITERAL: lex_array(tokens, token_index + 1);
            break;
          case L_CURLY: lex_object(tokens, token_index + 1);
            break;
          case L_SQUARE: lex_array(tokens, token_index + 1);
            break;
          case R_SQUARE: return;
          default: throw new Error('lex error');
        }
        break;
      case STRING_LITERAL:
      case NUMBER_LITERAL:
      case BOOLEAN_LITERAL:
      case NULL_LITERAL:
        switch (next_token.type) {
          case COMMA: lex_array(tokens, token_index + 1);
            break;
          case R_SQUARE: return;
          default: throw new Error('lex error');
        }
        break;
      case COMMA:
        switch (next_token.type) {
          case STRING_LITERAL:
          case NUMBER_LITERAL:
          case BOOLEAN_LITERAL:
          case NULL_LITERAL: lex_array(tokens, token_index + 1);
            break;
          case L_CURLY: lex_object(tokens, token_index + 1);
            break;
          case L_SQUARE: lex_array(tokens, token_index + 1);
            break;
          default: throw new Error('lex error');
        }
        break;
      default: throw new Error('lex error');
    }
  }

  // A document must start with L_CURLY | L_SQUARE
  switch (head(tokens).type) {
    case L_CURLY: lex_object(tokens, 0);
      break;
    case L_SQUARE: lex_array(tokens, 0);
      break;
    default: throw new Error('lex error');
  }
  return tokens;
}
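
// For illustration (not part of the original gist): the token stream of
// '{"a": 1}' walks L_CURLY -> STRING_LITERAL -> COLON -> NUMBER_LITERAL,
// whose R_CURLY successor ends the check, while '{1}' throws 'lex error'
// because NUMBER_LITERAL may not follow L_CURLY.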
// PRIMITIVE values
const TRUE = 'true';
const FALSE = 'false';

// TYPES
const OBJECT = 'object';
const ARRAY = 'array';

/**
 * AST Nodes
 * Value:
 *   | Object
 *   | Array
 *   | { type: BOOLEAN_LITERAL, value: string }
 *   | { type: NUMBER_LITERAL , value: string }
 *   | { type: STRING_LITERAL , value: string }
 *   | { type: NULL_LITERAL   , value: string }
 *
 * Array:
 *   {
 *     type: 'array',
 *     children: Value[]
 *   }
 *
 * Entry:
 *   {
 *     key: string
 *     value: Value
 *   }
 *
 * Object:
 *   {
 *     type: 'object',
 *     children: Entry[]
 *   }
 */
function parseObject(tokens, token_index) {
  const children = [];
  let ptr = token_index;
  while (tokens[ptr].type !== R_CURLY) {
    switch (tokens[ptr].type) {
      case COLON: {
        const current_key = tokens[ptr - 1].value;
        const [value, index] = parseValue(tokens, ptr + 1);
        children.push({
          key: current_key,
          value
        });
        ptr = index;
      }
        break;
      case COMMA: // ignore
        break;
      default:
        break;
    }
    ptr++;
  }
  return [
    {
      type: OBJECT,
      children
    },
    ptr
  ];
}

function parseArray(tokens, token_index) {
  const children = [];
  let ptr = token_index;
  while (tokens[ptr].type !== R_SQUARE) {
    switch (tokens[ptr].type) {
      case COMMA: // ignore
        break;
      default:
        const [value, index] = parseValue(tokens, ptr);
        children.push(value);
        ptr = index;
        break;
    }
    ptr++;
  }
  return [
    {
      type: ARRAY,
      children
    },
    ptr
  ];
}

function parseValue(tokens, token_index) {
  const value_token = tokens[token_index];
  switch (value_token.type) {
    case L_CURLY: return parseObject(tokens, token_index + 1);
    case L_SQUARE: return parseArray(tokens, token_index + 1);
    case NULL_LITERAL:
    case BOOLEAN_LITERAL:
    case NUMBER_LITERAL:
    case STRING_LITERAL:
      return [
        value_token,
        token_index
      ];
    default: throw new Error('parse error');
  }
}
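
// For illustration (not part of the original gist): parsing the tokens of
// '{"a": [1]}' yields the AST
//   { type: 'object',
//     children: [ { key: '"a"',
//                   value: { type: 'array',
//                            children: [ { type: 'NUMBER_LITERAL', value: '1' } ] } } ] }
// Note that keys keep their surrounding quotes until the walker strips them.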
// helpers
function head(xs) { return xs && xs[0]; }
function last(xs) { return xs && xs[xs.length - 1]; }
function is_string(s) {
  if (s === undefined || s.length === 0) return false;
  return (s[0] === '"' && s[s.length - 1] === '"');
}
// Numeric test via Number(): parseInt alone would accept junk like '12abc'.
function is_number(n) {
  return typeof n === 'string' && n.trim() !== '' && !isNaN(Number(n));
}
function is_float(n) {
  if (!n || n.indexOf('.') === -1) return false;
  return is_number(n);
}
function is_boolean(n) { return n === 'true' || n === 'false'; }
function is_null(n) { return n === 'null'; }
function stripQuotes(s) {
  if (typeof s !== 'string') return s;
  let start = 0, end = s.length;
  if (head(s) === DOUBLE_QUOTE_STR) start++;
  if (last(s) === DOUBLE_QUOTE_STR) end--;
  return s.substring(start, end);
}
function parse(tokens, token_index) {
  switch (head(tokens).type) {
    case L_CURLY: return parseObject(tokens, token_index + 1);
    case L_SQUARE: return parseArray(tokens, token_index + 1);
    default: throw new Error('parse error');
  }
}
function walker(ast_node, extraction_fn) {
  switch (ast_node.type) {
    case OBJECT:
      return ast_node.children.reduce((acc, child) => {
        const { key, value } = child;
        acc[stripQuotes(key)] = walker(value, extraction_fn);
        return acc;
      }, {});
    case ARRAY:
      return ast_node.children.reduce((acc, child) => {
        acc.push(walker(child, extraction_fn));
        return acc;
      }, []);
    case NULL_LITERAL:
    case NUMBER_LITERAL:
    case BOOLEAN_LITERAL:
    case STRING_LITERAL:
      return extraction_fn(ast_node);
  }
}
function extract_value(ast_node) {
  const { type, value } = ast_node;
  switch (type) {
    case NUMBER_LITERAL:
      // Number() handles integers, floats, and exponent forms such as 1e5.
      return Number(value);
    case STRING_LITERAL: return stripQuotes(value);
    case BOOLEAN_LITERAL: return value === TRUE;
    case NULL_LITERAL: return null;
  }
}
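
// For illustration (not part of the original gist): walking the AST of
// '{"a": [1, true]}' with extract_value yields the plain value
//   { a: [ 1, true ] }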
function json_parse(json_string) {
  const tokens = tokenize(json_string);
  const lexed_tokens = lex_analyze(tokens);
  const [ast, _] = parse(lexed_tokens, 0);
  return walker(ast, extract_value);
}

module.exports = {
  json_parse
};
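
A minimal usage sketch (assuming the file above is saved as json_parser.js, which is what the spec file below requires):

const { json_parse } = require('./json_parser.js');
console.log(json_parse('{"nums": [1, 2.5, null], "ok": true}'));
// -> { nums: [ 1, 2.5, null ], ok: true }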
const { json_parse } = require('./json_parser.js');

describe("JSON parser accepts valid cases", () => {
  it("should parse valid JSON [empty object]", () => {
    const json = {};
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });

  it("should parse valid JSON [empty array]", () => {
    const json = [];
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });

  it("should parse valid JSON [nested object]", () => {
    const json = {
      "glossary": {
        "title": "example glossary",
        "GlossDiv": {
          "title": "S",
          "GlossList": {
            "GlossEntry": {
              "ID": "SGML",
              "SortAs": "SGML",
              "GlossTerm": "Standard Generalized Markup Language",
              "Acronym": "SGML",
              "Abbrev": "ISO 8879:1986",
              "GlossDef": {
                "para": "A meta-markup language, used to create markup languages such as DocBook.",
                "GlossSeeAlso": ["GML", "XML"]
              },
              "GlossSee": "markup"
            }
          }
        }
      }
    };
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });

  it("should parse valid JSON [nested objects]", () => {
    const json = {
      k1: {
        k2: {
          k3: {
            k4: {
              k5: true,
              k9: 123
            },
            k6: 1.1123,
            k10: false
          },
          k7: "string"
        },
        k8: null
      }
    };
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });

  it("should parse valid JSON [nested arrays]", () => {
    const json = [1, [2, [3, [4], 5], 6], 7];
    const json_string = JSON.stringify(json);
    expect(JSON.stringify(json_parse(json_string))).toBe(json_string);
  });
});
describe("JSON parser reject invalid cases", () => { | |
it("Reject {{{{}}}}", () => { | |
const json_string = "{{{{}}}}"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject [,]", () => { | |
const json_string = "[,]"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject [1, 2, ]", () => { | |
const json_string = "[1, 2, ]"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject [{]}", () => { | |
const json_string = "[{]}"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject {1, 2, 3, 4}", () => { | |
const json_string = "{1, 2, 3, 4}"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject {a:", () => { | |
const json_string = "{a:"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it("Reject {a:b}", () => { | |
const json_string = "{a:b}"; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
it('Reject {"a":"b",}', () => { | |
const json_string = '{"a":"b",}'; | |
expect(() => json_parse(json_string)).toThrowError(new Error('lex error')); | |
}); | |
}); |
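
The spec relies on Jest-style globals (describe, it, expect, toThrowError), so it can be run with Jest:

# assuming the spec is saved as json_parser.test.js next to json_parser.js
npx jest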