huzecong/load_multiple_json_objects.py

## load_multiple_json_objects.py
# There are times when you're supposed to output a JSONL file, but forgot to put newlines between JSON objects.
# The `pickle` package can handle things like that, but `json` would complain that there are extra trailing characters
# and refuse to parse, although it's perfectly capable of doing so. What a stupid design.
# This snippet uses the lower-level APIs in `json` to handle such cases.

import json
from typing import Any, List

import tqdm


def skip_to_non_whitespace(contents: str, start: int = 0) -> int:
    r"""Find the first non-whitespace character of ``contents`` at or after ``start``.

    Only space, newline, tab, and carriage return are treated as whitespace.

    :param contents: The string to scan.
    :param start: Index at which scanning begins.
    :return: The index of the first character that is not whitespace, or ``len(contents)`` if no such character
        is found.
    """
    blanks = " \n\t\r"
    length = len(contents)
    # ``next`` with a default covers the "ran off the end" case without a separate trailing ``return``.
    return next((pos for pos in range(start, length) if contents[pos] not in blanks), length)


def load_json_objects(contents: str, verbose: bool = False) -> List[Any]:
    r"""Parse every JSON value found in ``contents``, where values may be separated by whitespace or by nothing.

    :param contents: A string holding zero or more concatenated JSON values.
    :param verbose: If ``True``, display a ``tqdm`` progress bar that advances by the number of characters consumed.
    :return: The decoded values, in the order they appear in ``contents``. Empty or whitespace-only input yields
        an empty list.
    :raises json.JSONDecodeError: If the text at the current position is not a valid JSON value.
    """
    decoder = json.JSONDecoder()
    data = []
    total = len(contents)
    with tqdm.tqdm(total=total, disable=(not verbose), desc="Parsing JSON") as progress:
        # Skip leading whitespace before the loop so that whitespace-only input never reaches ``raw_decode``
        # (which would raise on it). From here on, ``idx`` always points at a value or at the end of the string.
        idx = skip_to_non_whitespace(contents, 0)
        progress.update(idx)
        while idx < total:
            # ``raw_decode`` tolerates trailing data and reports where the decoded value ended.
            obj, end = decoder.raw_decode(contents, idx)
            data.append(obj)
            # Consume the separator so the next iteration starts directly on the next value.
            end = skip_to_non_whitespace(contents, end)
            progress.update(end - idx)
            idx = end
    return data


if __name__ == "__main__":
    # Smoke test: four JSON values of different types, separated only by spaces, must all be recovered in order.
    sample = '  {"1": 2}  [1, 2, 3]  4 "wow" '
    expected = [{"1": 2}, [1, 2, 3], 4, "wow"]
    parsed = load_json_objects(sample)
    assert parsed == expected
	# There are times when you're supposed to output a JSONL file, but forgot to put newlines between JSON objects.
	# The `pickle` package can handle things like that, but `json` would complain that there are extra trailing characters
	# and refuse to parse, although it's perfectly capable of doing so. What a stupid design.
	# This snippet uses the lower-level APIs in `json` to handle such cases.

	import json
	from typing import Any, List

	import tqdm


	def skip_to_non_whitespace(contents: str, start: int = 0) -> int:
		r"""Return the index of the next character in ``contents`` that is not a whitespace, starting from ``start``.

		Only space, newline, tab, and carriage return count as whitespace.

		:param contents: The string to scan.
		:param start: Index at which scanning begins.
		:return: The index of the first non-whitespace character found, or ``len(contents)`` if there is none.
		"""
		whitespace = " \n\t\r"
		for idx in range(start, len(contents)):
			if contents[idx] not in whitespace:
				return idx
		# Nothing but whitespace remained (or ``start`` was already past the end).
		return len(contents)


	def load_json_objects(contents: str, verbose: bool = False) -> List[Any]:
		r"""Parse every JSON value found in ``contents``, where values may be separated by whitespace or by nothing.

		:param contents: A string holding zero or more concatenated JSON values.
		:param verbose: If ``True``, display a ``tqdm`` progress bar that advances by the number of characters consumed.
		:return: The decoded values, in the order they appear in ``contents``. Empty or whitespace-only input yields
			an empty list.
		:raises json.JSONDecodeError: If the text at the current position is not a valid JSON value.
		"""
		decoder = json.JSONDecoder()
		data = []
		total = len(contents)
		with tqdm.tqdm(total=total, disable=(not verbose), desc="Parsing JSON") as progress:
			# Skip leading whitespace before the loop so that whitespace-only input never reaches ``raw_decode``
			# (which would raise on it). From here on, ``idx`` always points at a value or at the end of the string.
			idx = skip_to_non_whitespace(contents, 0)
			progress.update(idx)
			while idx < total:
				# ``raw_decode`` tolerates trailing data and reports where the decoded value ended.
				obj, end = decoder.raw_decode(contents, idx)
				data.append(obj)
				# Consume the separator so the next iteration starts directly on the next value.
				end = skip_to_non_whitespace(contents, end)
				progress.update(end - idx)
				idx = end
		return data


	if __name__ == "__main__":
		# Smoke test: four JSON values of different types, separated only by spaces, must all be recovered in order.
		test_string = ' {"1": 2} [1, 2, 3] 4 "wow" '
		objects = load_json_objects(test_string)
		assert objects == [
			{"1": 2},
			[1, 2, 3],
			4,
			"wow",
		]