-
-
Save micseydel/babcbb20f4301d8ab5ea5a2f98ccf16a to your computer and use it in GitHub Desktop.
Code for flattening jsonl files containing arrays into jsonl files without arrays.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
""" | |
This script takes a file and an array name as command line parameters. | |
It iterates over the lines of the file, each of which must be valid JSON, | |
and flattens the arrays into multiple lines in a file FILENAME.flattened. | |
e.g. given lines like | |
{"_id":{"$oid":"52c1ab5ce4b05c06a14c79d9"}, "userId": 1, "race":["white", "asian"]} | |
will produce lines like
{"__index": 0, "context": {"_id": {"$oid": "52c1ab5ce4b05c06a14c79d9"}, "userId": 1}, "element": "white"}
{"__index": 1, "context": {"_id": {"$oid": "52c1ab5ce4b05c06a14c79d9"}, "userId": 1}, "element": "asian"}
The generated lines will always have exactly three keys: __index, context, element (where element may be a string, number or object) | |
""" | |
import json
import os
import sys
from copy import copy
from typing import Any, Dict, List, Tuple
def remove_nested_element(containing_nested: Dict[str, Any], keys: List[str]) -> None:
    """
    Delete, in place, the entry reached by following ``keys`` into ``containing_nested``.

    Every key except the last is used to descend one level; the last key is
    the entry deleted from the innermost container reached.

    Raises IndexError when ``keys`` is empty and KeyError when a key on the
    path is missing from a dict.
    """
    ancestors, leaf = keys[:-1], keys[-1]
    parent = containing_nested
    for ancestor in ancestors:
        parent = parent[ancestor]
    del parent[leaf]
def split(array_name: str, record: Dict[str, Any]) -> Tuple[List[Any], Dict[str, Any]]:
    """
    Extract the array named ``array_name`` from ``record``.

    ``array_name`` may be period-separated to indicate nesting. Returns a
    pair ``(array, the_rest)`` where ``the_rest`` is a copy of ``record`` with
    the most deeply nested key removed. ``record`` itself is left unmodified.

    e.g. "a.b.c" , {"_id": "glen", "a": {"b": {"c": [1, 2, 3], "d": "enied"}, "bb": 8}}
    returns ([1, 2, 3], {"_id": "glen", "a": {"b": {"d": "enied"}, "bb": 8}})

    Raises RuntimeError when a component of ``array_name`` cannot be looked
    up, or when the value found at ``array_name`` is not a list.
    """
    parts = array_name.split('.')
    the_rest = copy(record)
    parent = the_rest
    for part in parts[:-1]:
        try:
            child = parent[part]
        except (KeyError, TypeError) as e:
            raise RuntimeError("{} did not contain {}".format(parent, part)) from e
        # Copy every container on the path: a shallow copy of only the top
        # level would let the delete below reach into the caller's record.
        child = copy(child)
        parent[part] = child
        parent = child

    final = parts[-1]
    try:
        array = parent[final]
    except (KeyError, TypeError) as e:
        raise RuntimeError("{} did not contain {}".format(parent, final)) from e
    # Explicit check rather than assert, which is stripped under ``python -O``.
    if not isinstance(array, list):
        raise RuntimeError("{} is not an array: {!r}".format(array_name, array))
    del parent[final]
    return array, the_rest
def main(infile: str, array_name: str) -> None:
    """
    Flatten the array ``array_name`` of every JSON line in ``infile``.

    Writes the result to ``infile + ".flattened"``: one output line per array
    element, each a JSON object with the keys ``__index`` (position of the
    element in its array), ``context`` (the record as returned by ``split``,
    i.e. without the array) and ``element``. A record whose array is empty
    produces no output lines.

    Raises whatever ``json.loads`` raises for a malformed line and whatever
    ``split`` raises for a record that lacks the array.
    """
    with open(infile, encoding="utf-8") as lines, \
            open(infile + ".flattened", "w", encoding="utf-8") as flattened:
        for line in lines:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would raise on them.
            if not line.strip():
                continue
            base_record = json.loads(line)
            array, context = split(array_name, base_record)
            for index, element in enumerate(array):
                new_record = {
                    "__index": index,
                    "context": context,
                    "element": element,
                }
                flattened.write(json.dumps(new_record) + '\n')
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: ./flatten_arrays.py <lines.jsonl> <array_name>", file=sys.stderr)
        # Stop here: falling through would raise IndexError on the
        # sys.argv lookups below.
        sys.exit(1)
    filename = sys.argv[1]
    array_name = sys.argv[2]
    main(filename, array_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment