Skip to content

Instantly share code, notes, and snippets.

@Twista
Created October 7, 2019 12:48
Show Gist options
  • Select an option

  • Save Twista/e0045f51dd19626e3f445a3b160307f0 to your computer and use it in GitHub Desktop.

Select an option

Save Twista/e0045f51dd19626e3f445a3b160307f0 to your computer and use it in GitHub Desktop.
TJ - parse and process large GeoJSON files with low memory overhead
"""
Utility for dealing with large (Geo) JSON files.
It provides a fast, low-memory way to process large files (tested on 1 GB files, but it will work for larger ones).
Usage:
```python
import json
from tj import stream
file_name = "path/to/my.geojson"
for features in stream(file_name, key="features", chunk_size=2_000):
    f = json.loads(features)
    ...
```
"""
# Single-character JSON structural markers recognized by the tokenizer.
JSON_LEFT_CURLY = '{'
JSON_RIGHT_CURLY = '}'
# NOTE(review): despite the names, these are square brackets, not round ones.
JSON_LEFT_ROUND = '['
JSON_RIGHT_ROUND = ']'
JSON_COMMA = ','
JSON_COLON = ':'
JSON_QUOTE = '"'
# Characters treated as insignificant whitespace between tokens.
# NOTE(review): '\b' is not JSON whitespace per RFC 8259 — presumably
# harmless here, but verify it was intended.
JSON_WHITESPACE = [' ', '\t', '\b', '\n', '\r']
# Structural characters that terminate a value token in the stream.
JSON_SYNTAX = [JSON_COMMA, JSON_COLON, JSON_LEFT_ROUND, JSON_RIGHT_ROUND,
               JSON_LEFT_CURLY, JSON_RIGHT_CURLY]
# Characters stripped from the ends of extracted tokens (whitespace + quote).
STRIP_CHARS = JSON_WHITESPACE + [JSON_QUOTE]
def _chunks(filename, buffer_size=4096):
with open(filename, "r") as fp:
chunk = fp.read(buffer_size)
while chunk:
yield chunk
chunk = fp.read(buffer_size)
def _chars(filename, buffer_size):
    """Yield the file's characters one at a time, reading via buffered chunks."""
    for block in _chunks(filename, buffer_size):
        yield from block
def _tokenize(s, buffer_size: int):
    """Lazily split the file at path *s* into JSON syntax chars and value tokens.

    Yields each structural character (comma, colon, brackets, braces) as its
    own one-character token, and every run of other characters — whitespace
    stripped — as a value token.

    NOTE(review): the tokenizer is not quote-aware, so structural characters
    inside quoted strings are treated as syntax, and interior whitespace in
    string values is dropped — verify inputs accordingly.
    """
    strip_set = "".join(JSON_WHITESPACE)
    pending = []       # characters of the value token being accumulated
    in_token = False   # True once the current token has accepted a character
    for ch in _chars(s, buffer_size):
        if ch not in JSON_SYNTAX:
            # Drop whitespace only in the middle of a token; a token may
            # still *start* on a whitespace char (stripped again on flush).
            if in_token and ch in JSON_WHITESPACE:
                continue
            pending.append(ch)
            in_token = True
            continue
        # Structural character: flush any accumulated value token first.
        if in_token:
            token = "".join(pending).strip(strip_set)
            if token:  # skip tokens that were pure whitespace
                yield token
            pending = []
            in_token = False
        yield ch
def _emit_features(features: list):
return "[" + ",".join(features) + "]"
def stream(filename: str, key: str = "features", buffer_size: int = 1_048_576, chunk_size: int = 2_000):
    """Stream the array found under *key* in a large JSON file, in batches.

    Yields strings, each a JSON array of up to *chunk_size* serialized
    objects, so arbitrarily large files are processed with low memory use.

    :param filename: path to the JSON/GeoJSON file
    :param key: top-level key whose array value is streamed (e.g. "features")
    :param buffer_size: characters read from disk per I/O call
    :param chunk_size: number of objects batched into each yielded array
    """
    key = f'"{key}"'  # tokens keep their quotes, so quote the selector too
    key_found = False
    level = 0          # curly-brace nesting depth inside the selected array
    features = []      # serialized objects accumulated for the current chunk
    counter = 0
    buffer = ""
    for token in _tokenize(filename, buffer_size):
        # Skip everything until the selector key appears.
        if not key_found:
            if token not in JSON_SYNTAX:
                if key == token:
                    key_found = True
            continue
        # Key located -> track object nesting depth.
        if token == JSON_LEFT_CURLY:
            level += 1
        if token == JSON_RIGHT_CURLY:
            level -= 1
        if level == 0 and token in [JSON_COLON, JSON_COMMA, JSON_LEFT_ROUND, JSON_RIGHT_ROUND]:
            # Skip JSON syntax between the array's objects.
            continue
        buffer += token
        if level == 0:
            # Back at array level -> one complete object reconstructed.
            features.append(buffer)
            counter += 1
            buffer = ""
            if counter % chunk_size == 0:
                yield _emit_features(features)
                features = []
        if level < 0:
            # Closing brace of the enclosing root object -> done.
            break
    # BUG FIX: the original called _emit_features(features) here without
    # `yield`, discarding the trailing partial chunk (up to chunk_size - 1
    # objects were silently lost). Yield it, but only if non-empty.
    if features:
        yield _emit_features(features)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment