Created
April 1, 2020 19:22
-
-
Save huzecong/bc65352b95b0d4d27234a7ca1551068d to your computer and use it in GitHub Desktop.
Load multiple JSON objects from a single string, in linear time.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# There are times when you're supposed to output a JSONL file, but forget to put newlines between JSON objects.
# The `pickle` package can handle things like that, but `json` would complain that there are extra trailing characters
# and refuse to parse, although it's perfectly capable of doing so. What a stupid design.
# This snippet uses lower-level APIs in `json` to handle such cases.
import json | |
from typing import Any, List | |
import tqdm | |
def skip_to_non_whitespace(contents: str, start: int = 0) -> int:
    r"""Find the first non-whitespace position in ``contents`` at or after ``start``.

    Only the four JSON whitespace characters (space, newline, tab, carriage return) are skipped.

    :param contents: The string to scan.
    :param start: Index at which scanning begins.
    :return: The index of the first character that is not whitespace, or ``len(contents)`` if
        every remaining character is whitespace (or there are no remaining characters).
    """
    blanks = " \n\t\r"
    length = len(contents)
    # Lazily enumerate the candidate positions; ``next`` stops at the first hit.
    candidates = (pos for pos in range(start, length) if contents[pos] not in blanks)
    return next(candidates, length)
def load_json_objects(contents: str, verbose: bool = False) -> List[Any]:
    r"""Parse every JSON value stored back-to-back in a single string.

    Values may be separated (and surrounded) by any amount of JSON whitespace, or by none at all
    when the boundary between them is unambiguous (e.g. ``{"a": 1}{"b": 2}``). Each value is
    decoded with :meth:`json.JSONDecoder.raw_decode`, which reports where the value ended, so the
    string is never sliced or re-scanned.

    :param contents: The string holding zero or more concatenated JSON values.
    :param verbose: If ``True``, display a ``tqdm`` progress bar measured in characters consumed.
    :return: The decoded values, in the order they appear. An empty or whitespace-only string
        yields an empty list.
    :raises json.JSONDecodeError: If the text at the current position is not a valid JSON value.
    """
    decoder = json.JSONDecoder(object_hook=None, object_pairs_hook=None)
    data = []
    with tqdm.tqdm(total=len(contents), disable=(not verbose), desc="Parsing JSON") as progress:
        # Skip leading whitespace once, before the loop. Without this, whitespace-only input
        # would reach ``raw_decode`` at end-of-string and raise instead of returning ``[]``.
        idx = skip_to_non_whitespace(contents, 0)
        progress.update(idx)
        # Invariant: ``idx`` is either the start of a value or ``len(contents)``.
        while idx < len(contents):
            obj, end = decoder.raw_decode(contents, idx)
            # Consume the whitespace after the value so trailing padding ends the loop cleanly.
            end = skip_to_non_whitespace(contents, end)
            progress.update(end - idx)
            idx = end
            data.append(obj)
    return data
if __name__ == "__main__":
    # Self-check: four JSON values of different types separated only by spaces,
    # with extra padding at both ends of the string.
    test_string = ' {"1": 2} [1, 2, 3] 4 "wow" '
    expected = [{"1": 2}, [1, 2, 3], 4, "wow"]
    objects = load_json_objects(test_string)
    assert objects == expected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment