Created
November 4, 2012 18:32
-
-
Save rgaudin/4012915 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# vim: ai ts=4 sts=4 et sw=4 nu | |
import uuid | |
import copy | |
from collections import OrderedDict | |
import pandas as pd | |
recorda = {u'_attachments': [], | |
u'_bamboo_dataset_id': u'', | |
u'_deleted_at': None, | |
u'_geolocation': [None, None], | |
u'_id': 438852, | |
u'_status': u'submitted_via_web', | |
u'_submission_time': u'2012-10-26T10:56:30', | |
u'_uuid': u'427144a25cff4f3c8e76c041edaef979', | |
u'_xform_id_string': u'test_repeat_multiples', | |
u'age': u'32', | |
u'formhub/uuid': u'9ca652729404404a8e3dbed596332420', | |
u'hohh': u'reg', | |
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'3', | |
u'wifes/children/name': u'Salim'}, | |
{u'wifes/children/age': u'1', u'wifes/children/name': u'Soska'}, | |
{u'wifes/children/age': u'5', u'wifes/children/name': u'Fjj'}], | |
u'wifes/wife_name': u'Goundo'}, | |
{u'wifes/children': [{u'wifes/children/age': u'7', | |
u'wifes/children/name': u'Abdp'}, | |
{u'wifes/children/age': u'2', u'wifes/children/name': u'Rokia'}], | |
u'wifes/wife_name': u'Haja'}]} | |
recordb = {u'_attachments': [], | |
u'_bamboo_dataset_id': u'', | |
u'_deleted_at': None, | |
u'_geolocation': [None, None], | |
u'_id': 438855, | |
u'_status': u'submitted_via_web', | |
u'_submission_time': u'2012-10-29T06:44:43', | |
u'_uuid': u'04d3170da13341b5bcdd4bcfe58b5193', | |
u'_xform_id_string': u'test_repeat_multiples', | |
u'age': u'25', | |
u'formhub/uuid': u'9ca652729404404a8e3dbed596332420', | |
u'hohh': u'Gaz', | |
u'wifes': [{u'wifes/wife_name': u'Ryi'}, | |
{u'wifes/children': [{u'wifes/children/age': u'2', | |
u'wifes/children/name': u'Tre'}, | |
{u'wifes/children/age': u'5', | |
u'wifes/children/name': u'Cfe'}], | |
u'wifes/wife_name': u'Hyo'}]} | |
recordc = {u'_attachments': [], | |
u'_bamboo_dataset_id': u'', | |
u'_deleted_at': None, | |
u'_geolocation': [None, None], | |
u'_id': 438854, | |
u'_status': u'submitted_via_web', | |
u'_submission_time': u'2012-10-29T06:44:42', | |
u'_uuid': u'0ac9d53235a2495bbf579adda47c599f', | |
u'_xform_id_string': u'test_repeat_multiples', | |
u'age': u'57', | |
u'formhub/uuid': u'9ca652729404404a8e3dbed596332420', | |
u'hohh': u'tip'} | |
recordd = {u'_attachments': [], | |
u'_bamboo_dataset_id': u'', | |
u'_deleted_at': None, | |
u'_geolocation': [None, None], | |
u'_id': 438853, | |
u'_status': u'submitted_via_web', | |
u'_submission_time': u'2012-10-26T10:58:36', | |
u'_uuid': u'5ecefec6b9d44247949b0b1637642731', | |
u'_xform_id_string': u'test_repeat_multiples', | |
u'age': u'37', | |
u'formhub/uuid': u'9ca652729404404a8e3dbed596332420', | |
u'hohh': u'Ali', | |
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1', | |
u'wifes/children/name': u'Chya'}], | |
u'wifes/wife_name': u'Ba'}]} | |
recorda = {u'_attachments': [], | |
u'_bamboo_dataset_id': u'', | |
u'_deleted_at': None, | |
u'_geolocation': [None, None], | |
u'_id': 438858, | |
u'_status': u'submitted_via_web', | |
u'_submission_time': u'2012-10-29T08:46:15', | |
u'_uuid': u'99e029809e2b41919e27ca900cfb7377', | |
u'_xform_id_string': u'test_double_repeat_multiples', | |
u'age': u'26', | |
u'browser': [{u'browser/name': u'ie'}, | |
{u'browser/name': u'firefox'}, | |
{u'browser/name': u'chrome'}, | |
{u'browser/name': u'chromium'}], | |
u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb', | |
u'hohh': u'Fad', | |
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1', | |
u'wifes/children/name': u'Assi'}], | |
u'wifes/wife_name': u'Soum'}]} | |
recordb = {u'_attachments': [], | |
u'_bamboo_dataset_id': u'', | |
u'_deleted_at': None, | |
u'_geolocation': [None, None], | |
u'_id': 438856, | |
u'_status': u'submitted_via_web', | |
u'_submission_time': u'2012-10-29T08:46:14', | |
u'_uuid': u'007487526a7c45cfa1d44025057e434d', | |
u'_xform_id_string': u'test_double_repeat_multiples', | |
u'age': u'10', | |
u'browser': [{u'browser/name': u'firefox'}, {u'browser/name': u'chromium'}], | |
u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb', | |
u'hohh': u'Reg', | |
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'3', | |
u'wifes/children/name': u'Salim'}, | |
{u'wifes/children/age': u'1', | |
u'wifes/children/name': u'Siska'}], | |
u'wifes/wife_name': u'Goun'}, | |
{u'wifes/children': [{u'wifes/children/age': u'4', | |
u'wifes/children/name': u'Kadi'}], | |
u'wifes/wife_name': u'Aja'}, | |
{u'wifes/wife_name': u'Awa'}]} | |
recordc = {u'_attachments': [], | |
u'_bamboo_dataset_id': u'', | |
u'_deleted_at': None, | |
u'_geolocation': [None, None], | |
u'_id': 438857, | |
u'_status': u'submitted_via_web', | |
u'_submission_time': u'2012-10-29T08:46:15', | |
u'_uuid': u'465ab1843dd84563a076c68b34e26208', | |
u'_xform_id_string': u'test_double_repeat_multiples', | |
u'age': u'35', | |
u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb', | |
u'hohh': u'Ali', | |
u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1', | |
u'wifes/children/name': u'Chya'}], | |
u'wifes/wife_name': u'Ba'}]} | |
recordd = {u'_attachments': [], | |
u'_bamboo_dataset_id': u'', | |
u'_deleted_at': None, | |
u'_geolocation': [None, None], | |
u'_id': 438859, | |
u'_status': u'submitted_via_web', | |
u'_submission_time': u'2012-10-29T08:46:16', | |
u'_uuid': u'a06126d70b454915971a8d196e187e8a', | |
u'_xform_id_string': u'test_double_repeat_multiples', | |
u'age': u'27', | |
u'browser': [{u'browser/name': u'firefox'}], | |
u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb', | |
u'hohh': u'Alou', | |
u'wifes': [{u'wifes/wife_name': u'Kadi'}]} | |
columns = OrderedDict([(u'formhub/uuid', None), | |
(u'_uuid', None), | |
(u'hohh', None), | |
(u'age', None), | |
(u'wifes', []), | |
(u'wifes/wife_name', None), | |
(u'wifes/children/name', None), | |
(u'wifes/children/age', None)]) | |
columns = OrderedDict([(u'formhub/uuid', None), | |
(u'_uuid', None), | |
(u'hohh', None), (u'age', None), | |
(u'wifes', []), (u'wifes/wife_name', None), | |
(u'wifes/children', []), | |
(u'wifes/children/name', None), | |
(u'wifes/children/age', None), | |
(u'browser', []), | |
(u'browser/name', None)]) | |
# build a list of repeating sections and pop them out of columns | |
repeat_columns = [column if columns.pop(column, column) else column | |
for column, column_type in columns.items() | |
if isinstance(column_type, list)] | |
from pprint import pprint as pp ; pp(repeat_columns) | |
from pprint import pprint as pp ; pp(columns) | |
def dataframe_for_record(record): | |
rows = {} | |
def create_new_row(data, record, columns, depth): | |
row_id = uuid.uuid4().hex | |
row = copy.copy(data) | |
row.update({'depth': depth}) | |
for column in columns: | |
if column in record: | |
row.update({column: record.get(column)}) | |
if row_id in rows.keys(): | |
return rows.get(row_id) | |
rows.update({row_id: row}) | |
return row | |
def update_row(row, key, value): | |
row.update({key: value}) | |
def traverse_repeat_sections(iterobj, columns, row, record, depth): | |
if isinstance(iterobj, dict): | |
for key, value in iterobj.items(): | |
if key in columns: | |
# we have a wanted data | |
# we know it's not a list | |
update_row(row, key, value) | |
for key, value in iterobj.items(): | |
if isinstance(value, (list, dict)): | |
# somes values are dict/list. | |
# might contain wanted data | |
if isinstance(value, list): | |
d = depth + 1 | |
else: | |
d = depth | |
traverse_repeat_sections(value, columns, row, record, d) | |
elif isinstance(iterobj, list): | |
for item in iterobj: | |
# create a row for each | |
if row.get('depth') != depth: | |
nrow = create_new_row(row, record, columns, depth) | |
else: | |
nrow = row | |
traverse_repeat_sections(item, columns, nrow, record, depth + 1) | |
else: | |
print('??? %s' % iterobj) | |
# construct rows for each repeat sections | |
for repeat_section in repeat_columns: | |
traverse_repeat_sections(record.get(repeat_section), | |
columns.keys(), {}, record, 0) | |
# if we have no rows yet (no repeat sections), let's build a simpe one. | |
if not len(rows): | |
create_new_row({}, record, columns, 0) | |
def cleanup_intermediate_rows(rows, max_depth): | |
ids = list(rows.keys()) | |
for row_id in ids: | |
del(rows[row_id]['depth']) | |
# get max depth | |
max_depth = max([d.get('depth', 0) for d in rows.values()]) | |
# remove depth column and intermediate rows | |
cleanup_intermediate_rows(rows, max_depth) | |
series = [] | |
for row_id, row in rows.items(): | |
s = pd.Series(row.values(), index=row.keys()) | |
series.append(s) | |
return pd.DataFrame(series) | |
main_df = None | |
dfs = [] | |
for record in [recorda, recordb, recordc, recordd]: | |
df = dataframe_for_record(record) | |
dfs.append(df) | |
from pprint import pprint as pp ; pp(df) | |
if main_df is None: | |
main_df = df | |
else: | |
main_df = pd.concat([main_df, df], join='outer', ignore_index=True) | |
print('############') | |
# main_df = main_df.dropna(axis=1) | |
from pprint import pprint as pp ; pp(main_df) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment