micseydel/flatten_arrays.py Secret

## flatten_arrays.py
#!/usr/bin/env python3

"""
This script takes a file and an array name as command line parameters.
It iterates over the lines of the file, each of which must be valid JSON,
and flattens the arrays into multiple lines in a file FILENAME.flattened.

e.g. given lines like

{"_id":{"$oid":"52c1ab5ce4b05c06a14c79d9"}, "userId": 1, "race":["white", "asian"]}

will produce lines like

{"__index": 0, "context": {"_id": {"$oid": "52c1ab5ce4b05c06a14c79d9"}, "userId": 1}, "element": "white"}
{"__index": 1, "context": {"_id": {"$oid": "52c1ab5ce4b05c06a14c79d9"}, "userId": 1}, "element": "asian"}

The generated lines will always have exactly three keys: __index, context, element (where element may be a string, number or object)
"""

import json
import os
import sys
from copy import copy
from typing import Any, Dict, List, Tuple

def remove_nested_element(containing_nested: Dict[str, Any], keys: List[str]) -> None:
    """
    Delete the entry addressed by the key path `keys` from `containing_nested`, in place.

    Every key except the last is used to descend one level; the last key is then
    deleted from the innermost container that was reached. Nothing is returned.
    """
    leaf = keys[-1]
    ancestors = keys[:-1]

    holder = containing_nested
    for ancestor in ancestors:
        holder = holder[ancestor]

    del holder[leaf]


def split(array_name: str, record: Dict[str, Any]) -> Tuple[List[Any], Dict[str, Any]]:
    """
    Extract the array stored under `array_name` in `record`.

    `array_name` may be period-separated to indicate nesting. Returns a tuple of
    (the array, a copy of the record with the most deeply nested key removed).
    The `record` passed in is left unmodified.

    e.g. "a.b.c" , {"_id": "glen", "a": {"b": {"c": [1, 2, 3], "d": "enied"}, "bb": 8}}
    returns ([1, 2, 3], {"_id": "glen", "a": {"b": {"d": "enied"}, "bb": 8}})

    Raises RuntimeError if a part of `array_name` is missing from the record and
    AssertionError if the value found at `array_name` is not a list.
    """
    parts = array_name.split('.')

    subrecord = record
    for part in parts:
        try:
            subrecord = subrecord[part]
        except KeyError as e:
            raise RuntimeError("{} did not contain {}".format(subrecord, part)) from e

    # Raised explicitly instead of via `assert` so the check still runs under
    # `python -O`; the exception type is unchanged for existing callers.
    if not isinstance(subrecord, list):
        raise AssertionError("{} is not an array".format(array_name))

    # Copying only the top level would leave the nested dicts shared with
    # `record`, so deleting a nested key would also remove it from the caller's
    # data. Copy every dict along the path before deleting the final key.
    the_rest = copy(record)
    parent = the_rest
    for part in parts[:-1]:
        parent[part] = copy(parent[part])
        parent = parent[part]
    del parent[parts[-1]]

    return subrecord, the_rest


def main(infile, array_name):
    """
    Flatten the array named `array_name` in every JSON line of `infile`.

    For each element of the array, one JSON line is written to the file
    `infile` + ".flattened" with the keys "__index", "context" and "element".
    Blank lines in the input are skipped.
    """
    # Both files are opened as UTF-8 so the output encoding does not depend on the locale default.
    with open(infile, encoding="utf-8") as lines, open(infile + ".flattened", "w", encoding="utf-8") as flattened:
        for line in lines:
            # A blank line (typically a trailing one) is not a record and would make json.loads raise.
            if not line.strip():
                continue

            base_record = json.loads(line)

            array, context = split(array_name, base_record)

            for index, element in enumerate(array):
                new_record = {
                    "__index": index,
                    "context": context,
                    "element": element,
                }

                flattened.write(json.dumps(new_record))
                flattened.write('\n')


if __name__ == "__main__":
    # Stop after reporting usage: continuing would raise IndexError on the
    # sys.argv lookups below. sys.exit with a string prints it to stderr and
    # exits with status 1.
    if len(sys.argv) < 3:
        sys.exit("Usage: ./flatten_arrays.py <lines.jsonl> <array_name>")

    filename = sys.argv[1]
    array_name = sys.argv[2]
    main(filename, array_name)
	#!/usr/bin/env python3

	"""
	This script takes a file and an array name as command line parameters.
	It iterates over the lines of the file, each of which must be valid JSON,
	and flattens the arrays into multiple lines in a file FILENAME.flattened.

	e.g. given lines like

	{"_id":{"$oid":"52c1ab5ce4b05c06a14c79d9"}, "userId": 1, "race":["white", "asian"]}

	will produce lines like

	{"__index": 0, "context": {"_id": {"$oid": "52c1ab5ce4b05c06a14c79d9"}, "userId": 1}, "element": "white"}
	{"__index": 1, "context": {"_id": {"$oid": "52c1ab5ce4b05c06a14c79d9"}, "userId": 1}, "element": "asian"}

	The generated lines will always have exactly three keys: __index, context, element (where element may be a string, number or object)
	"""

	import os
	import sys
	import json
	from copy import copy

	from typing import Dict, Any, List

	def remove_nested_element(containing_nested: Dict[str, Any], keys: List[str]) -> None:
		"""
		Delete the entry addressed by the key path `keys` from `containing_nested`, in place.

		Every key except the last is used to descend one level; the last key is then
		deleted from the innermost container that was reached. Nothing is returned.
		"""
		final = keys[-1]

		record = containing_nested
		for key in keys[:-1]:
			record = record[key]

		del record[final]


	def split(array_name: str, record: Dict[str, Any]) -> Tuple[List[Any], Dict[str, Any]]:
		"""
		Extract the array stored under `array_name` in `record`.

		`array_name` may be period-separated to indicate nesting. Returns a tuple of
		(the array, a copy of the record with the most deeply nested key removed).
		The `record` passed in is left unmodified.

		e.g. "a.b.c" , {"_id": "glen", "a": {"b": {"c": [1, 2, 3], "d": "enied"}, "bb": 8}}
		returns ([1, 2, 3], {"_id": "glen", "a": {"b": {"d": "enied"}, "bb": 8}})

		Raises RuntimeError if a part of `array_name` is missing from the record and
		AssertionError if the value found at `array_name` is not a list.
		"""
		parts = array_name.split('.')

		subrecord = record
		for part in parts:
			try:
				subrecord = subrecord[part]
			except KeyError as e:
				raise RuntimeError("{} did not contain {}".format(subrecord, part)) from e

		# Raised explicitly instead of via `assert` so the check still runs under
		# `python -O`; the exception type is unchanged for existing callers.
		if not isinstance(subrecord, list):
			raise AssertionError("{} is not an array".format(array_name))

		# Copying only the top level would leave the nested dicts shared with
		# `record`, so deleting a nested key would also remove it from the caller's
		# data. Copy every dict along the path before deleting the final key.
		the_rest = copy(record)
		parent = the_rest
		for part in parts[:-1]:
			parent[part] = copy(parent[part])
			parent = parent[part]
		del parent[parts[-1]]

		return subrecord, the_rest


	def main(infile, array_name):
		"""
		Flatten the array named `array_name` in every JSON line of `infile`.

		For each element of the array, one JSON line is written to the file
		`infile` + ".flattened" with the keys "__index", "context" and "element".
		Blank lines in the input are skipped.
		"""
		# Both files are opened as UTF-8 so the output encoding does not depend on the locale default.
		with open(infile, encoding="utf-8") as lines, open(infile + ".flattened", "w", encoding="utf-8") as flattened:
			for line in lines:
				# A blank line (typically a trailing one) is not a record and would make json.loads raise.
				if not line.strip():
					continue

				base_record = json.loads(line)

				array, context = split(array_name, base_record)

				for index, element in enumerate(array):
					new_record = {
						"__index": index,
						"context": context,
						"element": element,
					}

					flattened.write(json.dumps(new_record))
					flattened.write('\n')


	if __name__ == "__main__":
		# Stop after reporting usage: continuing would raise IndexError on the
		# sys.argv lookups below. sys.exit with a string prints it to stderr and
		# exits with status 1.
		if len(sys.argv) < 3:
			sys.exit("Usage: ./flatten_arrays.py <lines.jsonl> <array_name>")

		filename = sys.argv[1]
		array_name = sys.argv[2]
		main(filename, array_name)