Skip to content

Instantly share code, notes, and snippets.

@rgaudin
Created November 4, 2012 18:32
Show Gist options
  • Save rgaudin/4012915 to your computer and use it in GitHub Desktop.
Save rgaudin/4012915 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
import uuid
import copy
from collections import OrderedDict
import pandas as pd
recorda = {u'_attachments': [],
u'_bamboo_dataset_id': u'',
u'_deleted_at': None,
u'_geolocation': [None, None],
u'_id': 438852,
u'_status': u'submitted_via_web',
u'_submission_time': u'2012-10-26T10:56:30',
u'_uuid': u'427144a25cff4f3c8e76c041edaef979',
u'_xform_id_string': u'test_repeat_multiples',
u'age': u'32',
u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
u'hohh': u'reg',
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'3',
u'wifes/children/name': u'Salim'},
{u'wifes/children/age': u'1', u'wifes/children/name': u'Soska'},
{u'wifes/children/age': u'5', u'wifes/children/name': u'Fjj'}],
u'wifes/wife_name': u'Goundo'},
{u'wifes/children': [{u'wifes/children/age': u'7',
u'wifes/children/name': u'Abdp'},
{u'wifes/children/age': u'2', u'wifes/children/name': u'Rokia'}],
u'wifes/wife_name': u'Haja'}]}
recordb = {u'_attachments': [],
u'_bamboo_dataset_id': u'',
u'_deleted_at': None,
u'_geolocation': [None, None],
u'_id': 438855,
u'_status': u'submitted_via_web',
u'_submission_time': u'2012-10-29T06:44:43',
u'_uuid': u'04d3170da13341b5bcdd4bcfe58b5193',
u'_xform_id_string': u'test_repeat_multiples',
u'age': u'25',
u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
u'hohh': u'Gaz',
u'wifes': [{u'wifes/wife_name': u'Ryi'},
{u'wifes/children': [{u'wifes/children/age': u'2',
u'wifes/children/name': u'Tre'},
{u'wifes/children/age': u'5',
u'wifes/children/name': u'Cfe'}],
u'wifes/wife_name': u'Hyo'}]}
recordc = {u'_attachments': [],
u'_bamboo_dataset_id': u'',
u'_deleted_at': None,
u'_geolocation': [None, None],
u'_id': 438854,
u'_status': u'submitted_via_web',
u'_submission_time': u'2012-10-29T06:44:42',
u'_uuid': u'0ac9d53235a2495bbf579adda47c599f',
u'_xform_id_string': u'test_repeat_multiples',
u'age': u'57',
u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
u'hohh': u'tip'}
recordd = {u'_attachments': [],
u'_bamboo_dataset_id': u'',
u'_deleted_at': None,
u'_geolocation': [None, None],
u'_id': 438853,
u'_status': u'submitted_via_web',
u'_submission_time': u'2012-10-26T10:58:36',
u'_uuid': u'5ecefec6b9d44247949b0b1637642731',
u'_xform_id_string': u'test_repeat_multiples',
u'age': u'37',
u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
u'hohh': u'Ali',
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1',
u'wifes/children/name': u'Chya'}],
u'wifes/wife_name': u'Ba'}]}
recorda = {u'_attachments': [],
u'_bamboo_dataset_id': u'',
u'_deleted_at': None,
u'_geolocation': [None, None],
u'_id': 438858,
u'_status': u'submitted_via_web',
u'_submission_time': u'2012-10-29T08:46:15',
u'_uuid': u'99e029809e2b41919e27ca900cfb7377',
u'_xform_id_string': u'test_double_repeat_multiples',
u'age': u'26',
u'browser': [{u'browser/name': u'ie'},
{u'browser/name': u'firefox'},
{u'browser/name': u'chrome'},
{u'browser/name': u'chromium'}],
u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
u'hohh': u'Fad',
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1',
u'wifes/children/name': u'Assi'}],
u'wifes/wife_name': u'Soum'}]}
recordb = {u'_attachments': [],
u'_bamboo_dataset_id': u'',
u'_deleted_at': None,
u'_geolocation': [None, None],
u'_id': 438856,
u'_status': u'submitted_via_web',
u'_submission_time': u'2012-10-29T08:46:14',
u'_uuid': u'007487526a7c45cfa1d44025057e434d',
u'_xform_id_string': u'test_double_repeat_multiples',
u'age': u'10',
u'browser': [{u'browser/name': u'firefox'}, {u'browser/name': u'chromium'}],
u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
u'hohh': u'Reg',
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'3',
u'wifes/children/name': u'Salim'},
{u'wifes/children/age': u'1',
u'wifes/children/name': u'Siska'}],
u'wifes/wife_name': u'Goun'},
{u'wifes/children': [{u'wifes/children/age': u'4',
u'wifes/children/name': u'Kadi'}],
u'wifes/wife_name': u'Aja'},
{u'wifes/wife_name': u'Awa'}]}
recordc = {u'_attachments': [],
u'_bamboo_dataset_id': u'',
u'_deleted_at': None,
u'_geolocation': [None, None],
u'_id': 438857,
u'_status': u'submitted_via_web',
u'_submission_time': u'2012-10-29T08:46:15',
u'_uuid': u'465ab1843dd84563a076c68b34e26208',
u'_xform_id_string': u'test_double_repeat_multiples',
u'age': u'35',
u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
u'hohh': u'Ali',
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1',
u'wifes/children/name': u'Chya'}],
u'wifes/wife_name': u'Ba'}]}
recordd = {u'_attachments': [],
u'_bamboo_dataset_id': u'',
u'_deleted_at': None,
u'_geolocation': [None, None],
u'_id': 438859,
u'_status': u'submitted_via_web',
u'_submission_time': u'2012-10-29T08:46:16',
u'_uuid': u'a06126d70b454915971a8d196e187e8a',
u'_xform_id_string': u'test_double_repeat_multiples',
u'age': u'27',
u'browser': [{u'browser/name': u'firefox'}],
u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
u'hohh': u'Alou',
u'wifes': [{u'wifes/wife_name': u'Kadi'}]}
columns = OrderedDict([(u'formhub/uuid', None),
(u'_uuid', None),
(u'hohh', None),
(u'age', None),
(u'wifes', []),
(u'wifes/wife_name', None),
(u'wifes/children/name', None),
(u'wifes/children/age', None)])
columns = OrderedDict([(u'formhub/uuid', None),
(u'_uuid', None),
(u'hohh', None), (u'age', None),
(u'wifes', []), (u'wifes/wife_name', None),
(u'wifes/children', []),
(u'wifes/children/name', None),
(u'wifes/children/age', None),
(u'browser', []),
(u'browser/name', None)])
# build a list of repeating sections and pop them out of columns
repeat_columns = [column if columns.pop(column, column) else column
for column, column_type in columns.items()
if isinstance(column_type, list)]
from pprint import pprint as pp ; pp(repeat_columns)
from pprint import pprint as pp ; pp(columns)
def dataframe_for_record(record):
rows = {}
def create_new_row(data, record, columns, depth):
row_id = uuid.uuid4().hex
row = copy.copy(data)
row.update({'depth': depth})
for column in columns:
if column in record:
row.update({column: record.get(column)})
if row_id in rows.keys():
return rows.get(row_id)
rows.update({row_id: row})
return row
def update_row(row, key, value):
row.update({key: value})
def traverse_repeat_sections(iterobj, columns, row, record, depth):
if isinstance(iterobj, dict):
for key, value in iterobj.items():
if key in columns:
# we have a wanted data
# we know it's not a list
update_row(row, key, value)
for key, value in iterobj.items():
if isinstance(value, (list, dict)):
# somes values are dict/list.
# might contain wanted data
if isinstance(value, list):
d = depth + 1
else:
d = depth
traverse_repeat_sections(value, columns, row, record, d)
elif isinstance(iterobj, list):
for item in iterobj:
# create a row for each
if row.get('depth') != depth:
nrow = create_new_row(row, record, columns, depth)
else:
nrow = row
traverse_repeat_sections(item, columns, nrow, record, depth + 1)
else:
print('??? %s' % iterobj)
# construct rows for each repeat sections
for repeat_section in repeat_columns:
traverse_repeat_sections(record.get(repeat_section),
columns.keys(), {}, record, 0)
# if we have no rows yet (no repeat sections), let's build a simpe one.
if not len(rows):
create_new_row({}, record, columns, 0)
def cleanup_intermediate_rows(rows, max_depth):
ids = list(rows.keys())
for row_id in ids:
del(rows[row_id]['depth'])
# get max depth
max_depth = max([d.get('depth', 0) for d in rows.values()])
# remove depth column and intermediate rows
cleanup_intermediate_rows(rows, max_depth)
series = []
for row_id, row in rows.items():
s = pd.Series(row.values(), index=row.keys())
series.append(s)
return pd.DataFrame(series)
main_df = None
dfs = []
for record in [recorda, recordb, recordc, recordd]:
df = dataframe_for_record(record)
dfs.append(df)
from pprint import pprint as pp ; pp(df)
if main_df is None:
main_df = df
else:
main_df = pd.concat([main_df, df], join='outer', ignore_index=True)
print('############')
# main_df = main_df.dropna(axis=1)
from pprint import pprint as pp ; pp(main_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment