Skip to content

Instantly share code, notes, and snippets.

@blueset
Last active April 17, 2022 04:48
Show Gist options
  • Save blueset/78cc54d6da052c74ff105ff80bde025d to your computer and use it in GitHub Desktop.
Save blueset/78cc54d6da052c74ff105ff80bde025d to your computer and use it in GitHub Desktop.
A rough JSON parser in Python
import re
from typing import Any, Tuple, Union
"""A very rough JSON parser.
Implementing the standard outlined in https://www.json.org/json-en.html
Number parsing is handled by Python.
Usage:
>>> data, _ = parse_json('{"key": ["value", -1e20, true, false, null]}')
>>> assert data == {"key": ["value", -1e20, True, False, None]}
"""
# Leading run of the four characters JSON treats as insignificant whitespace.
whitespace_re = re.compile(r"^[ \t\n\r]+")

# Number literal: optional minus sign, integer digits, optional fraction,
# optional exponent.  Conversion of the matched text is left to int()/float().
float_re = re.compile(r"-?\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?")

# Two-character escape sequences mapped to the character they stand for.
single_char_escape = {
    "\\\\": "\\",
    "\\/": "/",
    '\\"': '"',
    "\\b": "\b",
    "\\f": "\f",
    "\\n": "\n",
    "\\r": "\r",
    "\\t": "\t",
}

# Longest run of string content that needs no decoding: anything except a
# backslash or a double quote.  The negated class already covers newline,
# carriage return and tab, so no separate alternative is needed for them.
plain_str_content_re = re.compile(r'[^\\"]+')
def leading_whitespaces(data: str) -> int:
    """Return how many whitespace characters *data* starts with.

    Whitespace is whatever ``whitespace_re`` accepts; 0 is returned when
    *data* does not begin with any.
    """
    found = whitespace_re.match(data)
    # The pattern is anchored at the start, so the end offset of the match
    # equals the length of the whitespace prefix.
    return found.end() if found else 0
def _read_hex4(data: str, start: int) -> int:
    """Return the value of the four hex digits at ``data[start:start + 4]``.

    Raises:
        ValueError: If fewer than four characters remain or any of them is
            not a hexadecimal digit.
    """
    digits = data[start:start + 4]
    # int(..., 16) alone is too lenient: it tolerates signs, underscores and
    # surrounding whitespace, none of which are legal in a \uXXXX escape.
    if len(digits) != 4 or any(c not in "0123456789abcdefABCDEF" for c in digits):
        raise ValueError(f"Invalid \\u escape: {data[start - 2:start + 4]!r}")
    return int(digits, 16)


def parse_string(data: str) -> Tuple[str, int]:
    """Parse the JSON string literal at the start of *data*.

    Args:
        data: Text whose first character is the opening double quote.

    Returns:
        A ``(value, consumed)`` tuple: the decoded string and the number of
        characters read from *data*, both quotes included.

    Raises:
        ValueError: If *data* does not start with a double quote, contains an
            unknown or truncated escape sequence, or has no closing quote.
    """
    if not data.startswith('"'):
        raise ValueError("A string must start with a double quote")
    pieces = []  # decoded fragments, joined once at the end
    end = len(data)
    ptr = 1
    while ptr < end:
        char = data[ptr]
        if char == '"':
            # Closing quote: count it as consumed and finish.
            return "".join(pieces), ptr + 1
        if char != "\\":
            # Plain content: take the whole run up to the next quote or
            # backslash in one step.  The match cannot fail because the
            # current character is neither of those.
            plain = plain_str_content_re.match(data, ptr)[0]
            pieces.append(plain)
            ptr += len(plain)
            continue
        escape = data[ptr:ptr + 2]
        if escape != "\\u":
            # Two-character escape.  Anything not in the table (including a
            # lone backslash at the end of input) is rejected; silently
            # skipping it would leave ptr unchanged and loop forever.
            if escape not in single_char_escape:
                raise ValueError(f"Invalid escape sequence: {escape!r}")
            pieces.append(single_char_escape[escape])
            ptr += 2
            continue
        # Unicode escape: backslash, 'u', four hex digits.
        code = _read_hex4(data, ptr + 2)
        ptr += 6
        if 0xD800 <= code <= 0xDBFF and data[ptr:ptr + 2] == "\\u":
            # A high surrogate directly followed by a low surrogate encodes
            # one code point outside the Basic Multilingual Plane.
            low = _read_hex4(data, ptr + 2)
            if 0xDC00 <= low <= 0xDFFF:
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                ptr += 6
            # Otherwise the second escape is left for the next iteration and
            # the lone surrogate is emitted as-is.
        pieces.append(chr(code))
    raise ValueError("Unterminated string")
def pares_number(s: str) -> Tuple[Union[float, int], int]:
    """Parse the JSON number at the start of *s*.

    The function name is misspelled ("pares"); it is kept as-is because
    callers refer to it by this name.

    Args:
        s: Text expected to begin with a number literal.

    Returns:
        A ``(value, consumed)`` tuple.  The value is an ``int`` when the
        literal has neither a fraction nor an exponent, otherwise a
        ``float``; ``consumed`` is the length of the literal.

    Raises:
        ValueError: If *s* does not start with a number accepted by
            ``float_re`` (for example a bare ``-``).
    """
    match = float_re.match(s)
    if match is None:
        # Subscripting a failed match would raise an unhelpful TypeError.
        raise ValueError(f"Invalid number: {s[:20]!r}")
    literal = match[0]
    if any(marker in literal for marker in ".eE"):
        return float(literal), len(literal)
    return int(literal), len(literal)
def parse_object(data: str) -> Tuple[dict, int]:
    """Parse the JSON object at the start of *data*.

    Args:
        data: Text whose first character is the opening ``{``.

    Returns:
        A ``(value, consumed)`` tuple: the parsed ``dict`` and the number of
        characters read from *data*, both braces included.  When a key is
        repeated, the last value wins.  A comma directly before the closing
        brace is tolerated.

    Raises:
        ValueError: If *data* does not start with ``{``, a key is not a
            string, a ``:`` or ``,`` separator is missing, or the input ends
            before the closing ``}``.
    """
    if not data.startswith("{"):
        raise ValueError("An object must start with '{'")
    result = {}
    size = len(data)
    ptr = 1
    while True:
        ptr += leading_whitespaces(data[ptr:])
        if ptr >= size:
            raise ValueError("Unterminated object")
        # Closing brace here means an empty object or a trailing comma.
        if data[ptr] == "}":
            return result, ptr + 1
        # Checked here rather than left to parse_string so that a non-string
        # key is always reported as malformed input.
        if data[ptr] != '"':
            raise ValueError(f"Expected a string key at offset {ptr}")
        key, proc_len = parse_string(data[ptr:])
        ptr += proc_len
        ptr += leading_whitespaces(data[ptr:])
        if ptr >= size or data[ptr] != ":":
            raise ValueError(f"Expected ':' at offset {ptr}")
        ptr += 1
        value, proc_len = parse_json(data[ptr:])
        ptr += proc_len
        ptr += leading_whitespaces(data[ptr:])
        result[key] = value
        if ptr >= size:
            raise ValueError("Unterminated object")
        if data[ptr] == "}":
            return result, ptr + 1
        if data[ptr] != ",":
            raise ValueError(f"Expected ',' or '}}' at offset {ptr}")
        ptr += 1
def parse_array(data: str) -> Tuple[list, int]:
    """Parse the JSON array at the start of *data*.

    Args:
        data: Text whose first character is the opening ``[``.

    Returns:
        A ``(value, consumed)`` tuple: the parsed ``list`` and the number of
        characters read from *data*, both brackets included.  A comma
        directly before the closing bracket is tolerated.

    Raises:
        ValueError: If *data* does not start with ``[``, a ``,`` separator is
            missing, or the input ends before the closing ``]``.
    """
    if not data.startswith("["):
        raise ValueError("An array must start with '['")
    result = []
    size = len(data)
    ptr = 1
    while True:
        ptr += leading_whitespaces(data[ptr:])
        if ptr >= size:
            raise ValueError("Unterminated array")
        # Closing bracket here means an empty array or a trailing comma.
        if data[ptr] == "]":
            return result, ptr + 1
        value, proc_len = parse_json(data[ptr:])
        ptr += proc_len
        result.append(value)
        ptr += leading_whitespaces(data[ptr:])
        if ptr >= size:
            raise ValueError("Unterminated array")
        if data[ptr] == "]":
            return result, ptr + 1
        if data[ptr] != ",":
            raise ValueError(f"Expected ',' or ']' at offset {ptr}")
        ptr += 1
def parse_json(data: str) -> Tuple[Any, int]:
    """Parse the JSON value at the start of *data*.

    Leading whitespace is skipped.  Text after the value is not examined;
    callers use the returned count to continue from there.

    Args:
        data: JSON text.

    Returns:
        A ``(value, consumed)`` tuple: the parsed value and the number of
        characters read from *data*, leading whitespace included.  Empty or
        whitespace-only input yields ``(None, len(data))``.

    Raises:
        ValueError: If the first non-whitespace character cannot start a
            JSON value.
    """
    ptr = leading_whitespaces(data)
    if ptr == len(data):
        return None, ptr
    head = data[ptr]
    if head == "{":
        val, proc_chr = parse_object(data[ptr:])
    elif head == "[":
        val, proc_chr = parse_array(data[ptr:])
    elif head == '"':
        val, proc_chr = parse_string(data[ptr:])
    elif head in "-0123456789":
        # A leading '+' is deliberately not accepted: JSON does not allow it
        # and the number pattern cannot match it.
        val, proc_chr = pares_number(data[ptr:])
    else:
        for literal, value in (("true", True), ("false", False), ("null", None)):
            if data.startswith(literal, ptr):
                return value, ptr + len(literal)
        raise ValueError(f"Unexpected character: {data[ptr:]}")
    return val, ptr + proc_chr
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment