Skip to content

Instantly share code, notes, and snippets.

@hoehrmann
Last active April 16, 2024 22:07
Show Gist options
  • Save hoehrmann/4118c131242302e016c03a12514d2b75 to your computer and use it in GitHub Desktop.
Save hoehrmann/4118c131242302e016c03a12514d2b75 to your computer and use it in GitHub Desktop.
Convert a JSON document into NDJSON: one array per line holding the file URI, the RFC 9535 normalized (and therefore unique) JSONPath, and the JSON value found at that path
import click
import ijson
import ijson.common
import json
import pathlib
import functools
def escape(s: str):
    """Escape *s* for use between the single quotes of a bracketed path name.

    Every control character U+0000..U+001F is replaced: backspace, tab,
    line feed, form feed and carriage return by their two-character
    short forms, all others by a lowercase ``\\u00xx`` sequence.  The
    apostrophe and the backslash are prefixed with a backslash.  Any other
    character is passed through unchanged.
    """
    # Start with the generic \u00xx form for the whole C0 control range ...
    table = {code: f"\\u{code:04x}" for code in range(0x20)}
    # ... then override the characters that have a dedicated short escape.
    table.update(
        {
            0x08: "\\b",
            0x09: "\\t",
            0x0A: "\\n",
            0x0C: "\\f",
            0x0D: "\\r",
            0x27: "\\'",
            0x5C: "\\\\",
        }
    )
    return s.translate(table)
def convert(source):
    """Stream a JSON document as ``(path, value)`` pairs.

    *source* is handed unchanged to ``ijson.basic_parse``.  One pair is
    yielded for every scalar value and for every *empty* map (``{}``) or
    array (``[]``); non-empty containers are represented only through
    their descendants.

    Each yielded ``path`` is a fresh list: the root identifier ``"$"``
    followed by one component per enclosing container, either an ``int``
    array index or an already bracketed and escaped ``"['name']"`` string.
    Because it is a snapshot, callers may keep or modify it freely.
    """
    # One entry per open container after the root identifier:
    #   None      -> inside a map, no key seen yet
    #   "['k']"   -> inside a map, positioned at key k
    #   int       -> inside an array, index of the next element
    path = ["$"]

    def advance():
        # A value just finished; if its parent is an array, step the index.
        if isinstance(path[-1], int):
            path[-1] += 1

    for event, value in ijson.basic_parse(source):
        if event == "start_map":
            path.append(None)
        elif event == "start_array":
            path.append(0)
        elif event == "map_key":
            path[-1] = "['" + escape(value) + "']"
        elif event == "end_map":
            # Still None means no key was ever seen: the map was empty.
            if path.pop() is None:
                yield (list(path), {})
            advance()
        elif event == "end_array":
            # Index still 0 means no element was ever seen: empty array.
            if path.pop() == 0:
                yield (list(path), [])
            advance()
        else:
            # Any remaining event carries a scalar value.
            yield (list(path), value)
            advance()
@click.command()
@click.argument("file", type=click.File("rb"))
def main(file: click.File):
    """
    Convert the JSON document FILE into newline-delimited JSON.

    One line is printed for every scalar value and every empty object or
    array in FILE.  Each line is a JSON array of three items: the URI of
    FILE, the normalized JSONPath of the value, and the value itself.
    """
    # Local import: only the serialisation fallback below needs it.
    import decimal

    def path_str(path):
        # Components after the leading "$" are either ints (array indexes)
        # or pre-bracketed "['name']" strings.  A normalized path is the
        # plain concatenation of bracketed segments, with no "." between.
        def helper(value, element):
            if isinstance(element, int):
                return f"{value}[{element}]"
            return value + element

        return functools.reduce(helper, path)

    def json_default(obj):
        # ijson reports non-integer numbers as Decimal, which the json
        # module cannot serialise; emit them as floats (precision beyond
        # a double is lost).
        if isinstance(obj, decimal.Decimal):
            return float(obj)
        raise TypeError(
            f"Object of type {type(obj).__name__} is not JSON serializable"
        )

    # click passes an already opened file object; it is opened in binary
    # mode so the parser decodes the bytes itself instead of relying on
    # the platform's default text encoding.
    file_uri = pathlib.Path(file.name).absolute().as_uri()
    for path, value in convert(file):
        print(
            json.dumps(
                [file_uri, path_str(path), value],
                default=json_default,
            )
        )
# Invoke the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment