Skip to content

Instantly share code, notes, and snippets.

@huzecong
Created April 1, 2020 19:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save huzecong/bc65352b95b0d4d27234a7ca1551068d to your computer and use it in GitHub Desktop.
Save huzecong/bc65352b95b0d4d27234a7ca1551068d to your computer and use it in GitHub Desktop.
Load multiple JSON objects from a single string, in linear time.
# There are times when you're supposed to output a JSONL file, but forgot to put newlines between JSON objects.
# The `pickle` package can handle things like that, but `json` would complain that there are extra trailing characters
# and refuse to parse, although it's perfectly capable of doing so. What a stupid design.
# This snippet uses the lower-level APIs in `json` to handle such cases.
import json
from typing import Any, List
import tqdm
def skip_to_non_whitespace(contents: str, start: int = 0) -> int:
    r"""Find the first non-whitespace character of ``contents`` at or after ``start``.

    Only space, newline, tab, and carriage return count as whitespace.

    :param contents: The string to scan.
    :param start: Index at which scanning begins.
    :return: Index of the first non-whitespace character found, or ``len(contents)`` if every
        remaining character is whitespace.
    """
    length = len(contents)
    # Lazily enumerate candidate positions; `next` stops at the first hit.
    positions = (pos for pos in range(start, length) if contents[pos] not in " \n\t\r")
    return next(positions, length)
def load_json_objects(contents: str, verbose: bool = False) -> List[Any]:
    r"""Parse every JSON value concatenated in ``contents`` and return them in order.

    Values may be separated by any amount of whitespace, or by none at all. An empty or
    whitespace-only string yields an empty list.

    :param contents: String holding zero or more JSON values back to back.
    :param verbose: If ``True``, show a ``tqdm`` progress bar measured in characters consumed.
    :return: List of the decoded Python objects, in the order they appear in ``contents``.
    :raises json.JSONDecodeError: If a non-whitespace position does not start a valid JSON value.
    """
    decoder = json.JSONDecoder()
    data = []
    total = len(contents)
    with tqdm.tqdm(total=total, disable=(not verbose), desc="Parsing JSON") as progress:
        # Consume leading whitespace before the loop so that whitespace-only input ends up with
        # ``idx == total`` and returns ``[]`` instead of asking the decoder to parse nothing.
        idx = skip_to_non_whitespace(contents, 0)
        progress.update(idx)
        while idx < total:
            # `raw_decode` parses one value starting exactly at `idx` and reports where it ended,
            # which lets us continue from there without slicing the string.
            obj, end = decoder.raw_decode(contents, idx)
            data.append(obj)
            # Skip the separator so `idx` is either at the next value or at the end of the input.
            end = skip_to_non_whitespace(contents, end)
            progress.update(end - idx)
            idx = end
    return data
if __name__ == "__main__":
    # Smoke test: four JSON values of different types, separated only by spaces,
    # with extra whitespace at both ends of the string.
    expected = [{"1": 2}, [1, 2, 3], 4, "wow"]
    test_string = ' {"1": 2} [1, 2, 3] 4 "wow" '
    objects = load_json_objects(test_string)
    assert objects == expected
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment