Created
October 7, 2019 12:48
-
-
Save Twista/e0045f51dd19626e3f445a3b160307f0 to your computer and use it in GitHub Desktop.
TJ - parse and process large GeoJSON files with low memory overhead
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Utility for dealing with large (Geo) JSON files. | |
| It provides a fast and low-memory way to process large files (tested on 1GB files, but will work for larger ones). | |
| Usage: | |
| ```python | |
| import json | |
| from tj import stream | |
| file_name = "path/to/my.geojson" | |
| for features in stream(file_name, key="features", chunk_size=2_000): | |
| f = json.loads(features) | |
| ... | |
| ``` | |
| """ | |
# Single-character JSON structural tokens.
JSON_LEFT_CURLY = '{'
JSON_RIGHT_CURLY = '}'
# NOTE(review): despite the "ROUND" names these are the SQUARE brackets that
# delimit a JSON array; names kept unchanged for backward compatibility.
JSON_LEFT_ROUND = '['
JSON_RIGHT_ROUND = ']'
JSON_COMMA = ','
JSON_COLON = ':'
JSON_QUOTE = '"'
# frozensets instead of lists: these containers are membership-tested once per
# input character in the tokenizer's hot loop, and set lookup is O(1).
# NOTE(review): '\b' is not JSON whitespace per RFC 8259 (and '\f' is missing);
# kept as-is to preserve the original stripping behavior.
JSON_WHITESPACE = frozenset([' ', '\t', '\b', '\n', '\r'])
JSON_SYNTAX = frozenset([JSON_COMMA, JSON_COLON, JSON_LEFT_ROUND, JSON_RIGHT_ROUND,
                         JSON_LEFT_CURLY, JSON_RIGHT_CURLY])
STRIP_CHARS = JSON_WHITESPACE | {JSON_QUOTE}
| def _chunks(filename, buffer_size=4096): | |
| with open(filename, "r") as fp: | |
| chunk = fp.read(buffer_size) | |
| while chunk: | |
| yield chunk | |
| chunk = fp.read(buffer_size) | |
def _chars(filename, buffer_size):
    """Yield the content of *filename* one character at a time.

    Reads through _chunks() so the whole file is never held in memory.
    """
    for block in _chunks(filename, buffer_size):
        yield from block
def _tokenize(s, buffer_size: int):
    """Yield JSON tokens from the file at path *s*.

    Structural characters ({ } [ ] , :) are yielded one at a time; any run of
    other characters between them is accumulated and yielded as a single
    token, stripped of surrounding whitespace (string quotes are preserved,
    so keys/values arrive still wrapped in '"').

    NOTE(review): quote state is not tracked, so a structural character
    embedded inside a quoted string value would be split incorrectly —
    confirm inputs never contain e.g. commas or braces inside strings.
    """
    buffer = ""
    # True once at least one non-structural character has been buffered
    # since the last structural token was emitted.
    waiting_for_char = False
    for ch in _chars(s, buffer_size):
        if ch in JSON_SYNTAX:
            if waiting_for_char:
                # flush the accumulated value/key token before the syntax char
                buffer = buffer.strip("".join(JSON_WHITESPACE))
                if len(buffer):  # skip tokens that were pure whitespace
                    yield buffer
                buffer = ""
                waiting_for_char = False
            yield ch
        else:
            # not a structural character: accumulate it, but drop whitespace
            # that occurs while a token is already being collected
            if ch not in JSON_WHITESPACE or waiting_for_char is False:
                buffer += ch
            waiting_for_char = True
| def _emit_features(features: list): | |
| return "[" + ",".join(features) + "]" | |
def stream(filename: str, key: str = "features", buffer_size: int = 1_048_576, chunk_size: int = 2_000):
    """Lazily stream the members of the JSON array stored under *key*.

    Yields strings, each one a JSON array of up to *chunk_size* serialized
    objects (e.g. '[{...},{...}]') that the caller can pass to json.loads().

    filename    -- path to the (Geo)JSON file
    key         -- name of the key whose array value should be streamed
    buffer_size -- read-buffer size (characters) handed to the tokenizer
    chunk_size  -- number of objects per yielded JSON array string

    NOTE(review): assumes *key* maps to an array of objects, as GeoJSON's
    "features" does; other value shapes are not supported.
    """
    key = f'"{key}"'  # tokens keep their quotes, so quote the selector too
    key_found = False
    level = 0       # curly-brace nesting depth inside the selected array
    features = []   # serialized objects collected for the current chunk
    counter = 0     # total number of complete objects seen so far
    buffer = ""     # tokens of the object currently being assembled
    for token in _tokenize(filename, buffer_size):
        # scan forward until the selector key appears
        # (key is quoted, i.e. >= 2 chars, so it can never equal a 1-char syntax token)
        if not key_found:
            if token == key:
                key_found = True
            continue
        if token == JSON_LEFT_CURLY:
            level += 1
        elif token == JSON_RIGHT_CURLY:
            level -= 1
        if level == 0:
            if token == JSON_RIGHT_ROUND:
                # closing ']' of the selected array -> nothing more to read.
                # (The original skipped it and kept scanning, so any key that
                # followed the array at root level was emitted as a bogus feature.)
                break
            if token in (JSON_COLON, JSON_COMMA, JSON_LEFT_ROUND):
                # structural syntax between top-level members -> skip
                continue
        buffer += token
        if level == 0:
            # back at array level -> one member object is complete
            features.append(buffer)
            buffer = ""
            counter += 1
            if counter % chunk_size == 0:
                yield _emit_features(features)
                features = []
        if level < 0:
            # left the enclosing root object entirely -> stop (defensive)
            break
    # BUG FIX: the original called _emit_features(features) here without
    # yielding the result, silently dropping the final partial chunk whenever
    # the feature count was not an exact multiple of chunk_size.
    if features:
        yield _emit_features(features)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment