Skip to content

Instantly share code, notes, and snippets.

@ryancollingwood
Created July 26, 2022 04:14
Show Gist options
  • Save ryancollingwood/4410087d528f81c0fceef2e174e97b6a to your computer and use it in GitHub Desktop.
Save ryancollingwood/4410087d528f81c0fceef2e174e97b6a to your computer and use it in GitHub Desktop.
Flattens a JSON-parsed obj into multiple lines. The last element [-1] in each line is the value. Elements [0:-1] are the path to that value. line_prefix, if specified, will prefix a value to the beginning of each outputted line. add_key_index will add the 0-based index position when encountering lists or tuples. remove_keys: if a key (at any point in the path) m…
def flatten_json_object_to_lines(obj, remove_keys=None, line_prefix=None, add_key_index=True):
    """
    Flatten a JSON-parsed object into multiple "lines" (tuples).

    The last element ``[-1]`` of each yielded tuple is a leaf value and the
    elements ``[0:-1]`` are the path leading to that value.

    Args:
        obj: The parsed JSON value. Dicts, lists and tuples are descended
            into; anything else is treated as a leaf value.
        remove_keys: Optional container of dict keys. A key found in it (at
            any depth of the path) is skipped together with everything
            beneath it.
        line_prefix: If not ``None``, this value is placed at the beginning
            of every yielded line.
        add_key_index: If true, the 0-based position of each list/tuple
            element is added to the path; if false, list positions are left
            out of the path entirely.

    Yields:
        One tuple ``(*path, value)`` per leaf value. Empty dicts and empty
        lists contain no leaves and therefore produce no lines. If ``obj``
        itself is not a dict, list or tuple it is yielded unchanged (not
        wrapped in a tuple and without ``line_prefix``).

    Adapted from:
    https://thispointer.com/python-how-to-iterate-over-nested-dictionary-dict-of-dicts/
    """
    containers = (dict, list, tuple)

    def walk(node, path):
        # ``path`` is always a tuple, so extending it can never fail,
        # regardless of whether a prefix or list indexes are in use.
        if isinstance(node, dict):
            for key, value in node.items():
                if remove_keys is not None and key in remove_keys:
                    continue
                yield from descend(value, path + (key,))
        else:
            for index, item in enumerate(node):
                item_path = path + (index,) if add_key_index else path
                yield from descend(item, item_path)

    def descend(value, path):
        # Containers are walked recursively; anything else terminates a line.
        if isinstance(value, containers):
            yield from walk(value, path)
        else:
            yield path + (value,)

    if not isinstance(obj, containers):
        # A bare scalar has no path; hand it back exactly as received.
        yield obj
        return

    prefix = () if line_prefix is None else (line_prefix,)
    yield from walk(obj, prefix)
@ryancollingwood
Copy link
Author

For example, given the following input for obj:

test_data = [{
    'id': 190995,
    'slot': {'type': 'green', 'status': 'occupied'},
    'modifier_id': 889,
    'internal_slot_id': 0,
    'subclass': 4,
    'items': [1,2,3,4,5],
    "nested_items": {
        "items": [
            {
                "id": 1,
                "display": {"name": "gamma"},
            },
            {"id": 2},
            {"id": 3},
        ]
    }
}]


list(flatten_json_object_to_lines(test_data, line_prefix="data"))

We'll get the output below. Note that each line has been prefixed with "data" as specified, and because we passed in a list with add_key_index = True, every line has the prefix followed by the index of the element. Since it was a list containing only one dictionary, that index is always zero.

[
    ('data', 0, 'id', 190995), 
    ('data', 0, 'slot', 'type', 'green'), 
    ('data', 0, 'slot', 'status', 'occupied'), 
    ('data', 0, 'modifier_id', 889), 
    ('data', 0, 'internal_slot_id', 0), 
    ('data', 0, 'subclass', 4), 
    ('data', 0, 'items', 0, 1), 
    ('data', 0, 'items', 1, 2), 
    ('data', 0, 'items', 2, 3), 
    ('data', 0, 'items', 3, 4), 
    ('data', 0, 'items', 4, 5), 
    ('data', 0, 'nested_items', 'items', 0, 'id', 1), 
    ('data', 0, 'nested_items', 'items', 0, 'display', 'name', 'gamma'), 
    ('data', 0, 'nested_items', 'items', 1, 'id', 2), 
    ('data', 0, 'nested_items', 'items', 2, 'id', 3)
]

Some more examples of input and output
input

list(flatten_json_object_to_lines(
{
   "start":[
      {
         "inside_list":{
            "sub_dict":[
               1,
               2,
               3,
               {
                  "mayhem":"yolo"
               }
            ]
         }
      }
   ]
})
)

output

[
    ('start', 0, 'inside_list', 'sub_dict', 0, 1), 
    ('start', 0, 'inside_list', 'sub_dict', 1, 2), 
    ('start', 0, 'inside_list', 'sub_dict', 2, 3), 
    ('start', 0, 'inside_list', 'sub_dict', 3, 'mayhem', 'yolo')
]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment