rgaudin/flat_tweaks.py

## flat_tweaks.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu

import uuid
import copy
from collections import OrderedDict

import pandas as pd

recorda = {u'_attachments': [],
 u'_bamboo_dataset_id': u'',
 u'_deleted_at': None,
 u'_geolocation': [None, None],
 u'_id': 438852,
 u'_status': u'submitted_via_web',
 u'_submission_time': u'2012-10-26T10:56:30',
 u'_uuid': u'427144a25cff4f3c8e76c041edaef979',
 u'_xform_id_string': u'test_repeat_multiples',
 u'age': u'32',
 u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
 u'hohh': u'reg',
 u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'3',
     u'wifes/children/name': u'Salim'},
    {u'wifes/children/age': u'1', u'wifes/children/name': u'Soska'},
    {u'wifes/children/age': u'5', u'wifes/children/name': u'Fjj'}],
   u'wifes/wife_name': u'Goundo'},
  {u'wifes/children': [{u'wifes/children/age': u'7',
     u'wifes/children/name': u'Abdp'},
    {u'wifes/children/age': u'2', u'wifes/children/name': u'Rokia'}],
   u'wifes/wife_name': u'Haja'}]}

recordb = {u'_attachments': [],
 u'_bamboo_dataset_id': u'',
 u'_deleted_at': None,
 u'_geolocation': [None, None],
 u'_id': 438855,
 u'_status': u'submitted_via_web',
 u'_submission_time': u'2012-10-29T06:44:43',
 u'_uuid': u'04d3170da13341b5bcdd4bcfe58b5193',
 u'_xform_id_string': u'test_repeat_multiples',
 u'age': u'25',
 u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
 u'hohh': u'Gaz',
 u'wifes': [{u'wifes/wife_name': u'Ryi'},
            {u'wifes/children': [{u'wifes/children/age': u'2',
                                  u'wifes/children/name': u'Tre'},
                                 {u'wifes/children/age': u'5',
                                  u'wifes/children/name': u'Cfe'}],
             u'wifes/wife_name': u'Hyo'}]}

recordc = {u'_attachments': [],
 u'_bamboo_dataset_id': u'',
 u'_deleted_at': None,
 u'_geolocation': [None, None],
 u'_id': 438854,
 u'_status': u'submitted_via_web',
 u'_submission_time': u'2012-10-29T06:44:42',
 u'_uuid': u'0ac9d53235a2495bbf579adda47c599f',
 u'_xform_id_string': u'test_repeat_multiples',
 u'age': u'57',
 u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
 u'hohh': u'tip'}

recordd = {u'_attachments': [],
 u'_bamboo_dataset_id': u'',
 u'_deleted_at': None,
 u'_geolocation': [None, None],
 u'_id': 438853,
 u'_status': u'submitted_via_web',
 u'_submission_time': u'2012-10-26T10:58:36',
 u'_uuid': u'5ecefec6b9d44247949b0b1637642731',
 u'_xform_id_string': u'test_repeat_multiples',
 u'age': u'37',
 u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
 u'hohh': u'Ali',
 u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1',
                                  u'wifes/children/name': u'Chya'}],
             u'wifes/wife_name': u'Ba'}]}


recorda = {u'_attachments': [],
 u'_bamboo_dataset_id': u'',
 u'_deleted_at': None,
 u'_geolocation': [None, None],
 u'_id': 438858,
 u'_status': u'submitted_via_web',
 u'_submission_time': u'2012-10-29T08:46:15',
 u'_uuid': u'99e029809e2b41919e27ca900cfb7377',
 u'_xform_id_string': u'test_double_repeat_multiples',
 u'age': u'26',
 u'browser': [{u'browser/name': u'ie'},
              {u'browser/name': u'firefox'},
              {u'browser/name': u'chrome'},
              {u'browser/name': u'chromium'}],
 u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
 u'hohh': u'Fad',
 u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1',
                                  u'wifes/children/name': u'Assi'}],
             u'wifes/wife_name': u'Soum'}]}


recordb = {u'_attachments': [],
 u'_bamboo_dataset_id': u'',
 u'_deleted_at': None,
 u'_geolocation': [None, None],
 u'_id': 438856,
 u'_status': u'submitted_via_web',
 u'_submission_time': u'2012-10-29T08:46:14',
 u'_uuid': u'007487526a7c45cfa1d44025057e434d',
 u'_xform_id_string': u'test_double_repeat_multiples',
 u'age': u'10',
 u'browser': [{u'browser/name': u'firefox'}, {u'browser/name': u'chromium'}],
 u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
 u'hohh': u'Reg',
 u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'3',
                                  u'wifes/children/name': u'Salim'},
                                 {u'wifes/children/age': u'1',
                                  u'wifes/children/name': u'Siska'}],
             u'wifes/wife_name': u'Goun'},
            {u'wifes/children': [{u'wifes/children/age': u'4',
                                  u'wifes/children/name': u'Kadi'}],
             u'wifes/wife_name': u'Aja'},
            {u'wifes/wife_name': u'Awa'}]}
recordc = {u'_attachments': [],
 u'_bamboo_dataset_id': u'',
 u'_deleted_at': None,
 u'_geolocation': [None, None],
 u'_id': 438857,
 u'_status': u'submitted_via_web',
 u'_submission_time': u'2012-10-29T08:46:15',
 u'_uuid': u'465ab1843dd84563a076c68b34e26208',
 u'_xform_id_string': u'test_double_repeat_multiples',
 u'age': u'35',
 u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
 u'hohh': u'Ali',
 u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1',
                                  u'wifes/children/name': u'Chya'}],
             u'wifes/wife_name': u'Ba'}]}

recordd = {u'_attachments': [],
 u'_bamboo_dataset_id': u'',
 u'_deleted_at': None,
 u'_geolocation': [None, None],
 u'_id': 438859,
 u'_status': u'submitted_via_web',
 u'_submission_time': u'2012-10-29T08:46:16',
 u'_uuid': u'a06126d70b454915971a8d196e187e8a',
 u'_xform_id_string': u'test_double_repeat_multiples',
 u'age': u'27',
 u'browser': [{u'browser/name': u'firefox'}],
 u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
 u'hohh': u'Alou',
 u'wifes': [{u'wifes/wife_name': u'Kadi'}]}

columns = OrderedDict([(u'formhub/uuid', None),
                       (u'_uuid', None),
                       (u'hohh', None),
                       (u'age', None),
                       (u'wifes', []),
                       (u'wifes/wife_name', None),
                       (u'wifes/children/name', None),
                       (u'wifes/children/age', None)])


columns = OrderedDict([(u'formhub/uuid', None),
                       (u'_uuid', None),
                       (u'hohh', None), (u'age', None),
                       (u'wifes', []), (u'wifes/wife_name', None),
                       (u'wifes/children', []),
                       (u'wifes/children/name', None),
                       (u'wifes/children/age', None),
                       (u'browser', []),
                       (u'browser/name', None)])


# build a list of repeating sections and pop them out of columns
repeat_columns = [column if columns.pop(column, column) else column
                  for column, column_type in columns.items()
                  if isinstance(column_type, list)]

from pprint import pprint as pp ; pp(repeat_columns)
from pprint import pprint as pp ; pp(columns)


def dataframe_for_record(record):

    rows = {}

    def create_new_row(data, record, columns, depth):

        row_id = uuid.uuid4().hex
        row = copy.copy(data)
        row.update({'depth': depth})

        for column in columns:
            if column in record:
                row.update({column: record.get(column)})

        if row_id in rows.keys():
            return rows.get(row_id)

        rows.update({row_id: row})
        return row

    def update_row(row, key, value):
        row.update({key: value})

    def traverse_repeat_sections(iterobj, columns, row, record, depth):

        if isinstance(iterobj, dict):
            for key, value in iterobj.items():
                if key in columns:
                    # we have a wanted data
                    # we know it's not a list
                    update_row(row, key, value)
            for key, value in iterobj.items():
                if isinstance(value, (list, dict)):
                    # somes values are dict/list.
                    # might contain wanted data
                    if isinstance(value, list):
                        d = depth + 1
                    else:
                        d = depth
                    traverse_repeat_sections(value, columns, row, record, d)

        elif isinstance(iterobj, list):
            for item in iterobj:
                # create a row for each
                if row.get('depth') != depth:
                    nrow = create_new_row(row, record, columns, depth)
                else:
                    nrow = row
                traverse_repeat_sections(item, columns, nrow, record, depth + 1)
        else:
            print('??? %s' % iterobj)

    # construct rows for each repeat sections
    for repeat_section in repeat_columns:
        traverse_repeat_sections(record.get(repeat_section),
                                 columns.keys(), {}, record, 0)

    # if we have no rows yet (no repeat sections), let's build a simpe one.
    if not len(rows):
        create_new_row({}, record, columns, 0)

    def cleanup_intermediate_rows(rows, max_depth):
        ids = list(rows.keys())
        for row_id in ids:
            del(rows[row_id]['depth'])

    # get max depth
    max_depth = max([d.get('depth', 0) for d in rows.values()])

    # remove depth column and intermediate rows
    cleanup_intermediate_rows(rows, max_depth)

    series = []
    for row_id, row in rows.items():
        s = pd.Series(row.values(), index=row.keys())
        series.append(s)

    return pd.DataFrame(series)

main_df = None
dfs = []
for record in [recorda, recordb, recordc, recordd]:
    df = dataframe_for_record(record)
    dfs.append(df)
    from pprint import pprint as pp ; pp(df)
    if main_df is None:
        main_df = df
    else:
        main_df = pd.concat([main_df, df], join='outer', ignore_index=True)
print('############')
# main_df = main_df.dropna(axis=1)
from pprint import pprint as pp ; pp(main_df)
	#!/usr/bin/env python
	# -- coding: utf-8 --
	# vim: ai ts=4 sts=4 et sw=4 nu

	import uuid
	import copy
	from collections import OrderedDict

	import pandas as pd

	recorda = {u'_attachments': [],
	u'_bamboo_dataset_id': u'',
	u'_deleted_at': None,
	u'_geolocation': [None, None],
	u'_id': 438852,
	u'_status': u'submitted_via_web',
	u'_submission_time': u'2012-10-26T10:56:30',
	u'_uuid': u'427144a25cff4f3c8e76c041edaef979',
	u'_xform_id_string': u'test_repeat_multiples',
	u'age': u'32',
	u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
	u'hohh': u'reg',
	u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'3',
	u'wifes/children/name': u'Salim'},
	{u'wifes/children/age': u'1', u'wifes/children/name': u'Soska'},
	{u'wifes/children/age': u'5', u'wifes/children/name': u'Fjj'}],
	u'wifes/wife_name': u'Goundo'},
	{u'wifes/children': [{u'wifes/children/age': u'7',
	u'wifes/children/name': u'Abdp'},
	{u'wifes/children/age': u'2', u'wifes/children/name': u'Rokia'}],
	u'wifes/wife_name': u'Haja'}]}

	recordb = {u'_attachments': [],
	u'_bamboo_dataset_id': u'',
	u'_deleted_at': None,
	u'_geolocation': [None, None],
	u'_id': 438855,
	u'_status': u'submitted_via_web',
	u'_submission_time': u'2012-10-29T06:44:43',
	u'_uuid': u'04d3170da13341b5bcdd4bcfe58b5193',
	u'_xform_id_string': u'test_repeat_multiples',
	u'age': u'25',
	u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
	u'hohh': u'Gaz',
	u'wifes': [{u'wifes/wife_name': u'Ryi'},
	{u'wifes/children': [{u'wifes/children/age': u'2',
	u'wifes/children/name': u'Tre'},
	{u'wifes/children/age': u'5',
	u'wifes/children/name': u'Cfe'}],
	u'wifes/wife_name': u'Hyo'}]}

	recordc = {u'_attachments': [],
	u'_bamboo_dataset_id': u'',
	u'_deleted_at': None,
	u'_geolocation': [None, None],
	u'_id': 438854,
	u'_status': u'submitted_via_web',
	u'_submission_time': u'2012-10-29T06:44:42',
	u'_uuid': u'0ac9d53235a2495bbf579adda47c599f',
	u'_xform_id_string': u'test_repeat_multiples',
	u'age': u'57',
	u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
	u'hohh': u'tip'}

	recordd = {u'_attachments': [],
	u'_bamboo_dataset_id': u'',
	u'_deleted_at': None,
	u'_geolocation': [None, None],
	u'_id': 438853,
	u'_status': u'submitted_via_web',
	u'_submission_time': u'2012-10-26T10:58:36',
	u'_uuid': u'5ecefec6b9d44247949b0b1637642731',
	u'_xform_id_string': u'test_repeat_multiples',
	u'age': u'37',
	u'formhub/uuid': u'9ca652729404404a8e3dbed596332420',
	u'hohh': u'Ali',
	u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1',
	u'wifes/children/name': u'Chya'}],
	u'wifes/wife_name': u'Ba'}]}









	recorda = {u'_attachments': [],
	u'_bamboo_dataset_id': u'',
	u'_deleted_at': None,
	u'_geolocation': [None, None],
	u'_id': 438858,
	u'_status': u'submitted_via_web',
	u'_submission_time': u'2012-10-29T08:46:15',
	u'_uuid': u'99e029809e2b41919e27ca900cfb7377',
	u'_xform_id_string': u'test_double_repeat_multiples',
	u'age': u'26',
	u'browser': [{u'browser/name': u'ie'},
	{u'browser/name': u'firefox'},
	{u'browser/name': u'chrome'},
	{u'browser/name': u'chromium'}],
	u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
	u'hohh': u'Fad',
	u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1',
	u'wifes/children/name': u'Assi'}],
	u'wifes/wife_name': u'Soum'}]}


	recordb = {u'_attachments': [],
	u'_bamboo_dataset_id': u'',
	u'_deleted_at': None,
	u'_geolocation': [None, None],
	u'_id': 438856,
	u'_status': u'submitted_via_web',
	u'_submission_time': u'2012-10-29T08:46:14',
	u'_uuid': u'007487526a7c45cfa1d44025057e434d',
	u'_xform_id_string': u'test_double_repeat_multiples',
	u'age': u'10',
	u'browser': [{u'browser/name': u'firefox'}, {u'browser/name': u'chromium'}],
	u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
	u'hohh': u'Reg',
	u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'3',
	u'wifes/children/name': u'Salim'},
	{u'wifes/children/age': u'1',
	u'wifes/children/name': u'Siska'}],
	u'wifes/wife_name': u'Goun'},
	{u'wifes/children': [{u'wifes/children/age': u'4',
	u'wifes/children/name': u'Kadi'}],
	u'wifes/wife_name': u'Aja'},
	{u'wifes/wife_name': u'Awa'}]}
	recordc = {u'_attachments': [],
	u'_bamboo_dataset_id': u'',
	u'_deleted_at': None,
	u'_geolocation': [None, None],
	u'_id': 438857,
	u'_status': u'submitted_via_web',
	u'_submission_time': u'2012-10-29T08:46:15',
	u'_uuid': u'465ab1843dd84563a076c68b34e26208',
	u'_xform_id_string': u'test_double_repeat_multiples',
	u'age': u'35',
	u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
	u'hohh': u'Ali',
	u'wifes': [{u'wifes/children': [{u'wifes/children/age': u'1',
	u'wifes/children/name': u'Chya'}],
	u'wifes/wife_name': u'Ba'}]}

	recordd = {u'_attachments': [],
	u'_bamboo_dataset_id': u'',
	u'_deleted_at': None,
	u'_geolocation': [None, None],
	u'_id': 438859,
	u'_status': u'submitted_via_web',
	u'_submission_time': u'2012-10-29T08:46:16',
	u'_uuid': u'a06126d70b454915971a8d196e187e8a',
	u'_xform_id_string': u'test_double_repeat_multiples',
	u'age': u'27',
	u'browser': [{u'browser/name': u'firefox'}],
	u'formhub/uuid': u'08230bc26da247c1bb2a9988016dfdcb',
	u'hohh': u'Alou',
	u'wifes': [{u'wifes/wife_name': u'Kadi'}]}

	columns = OrderedDict([(u'formhub/uuid', None),
	(u'_uuid', None),
	(u'hohh', None),
	(u'age', None),
	(u'wifes', []),
	(u'wifes/wife_name', None),
	(u'wifes/children/name', None),
	(u'wifes/children/age', None)])


	columns = OrderedDict([(u'formhub/uuid', None),
	(u'_uuid', None),
	(u'hohh', None), (u'age', None),
	(u'wifes', []), (u'wifes/wife_name', None),
	(u'wifes/children', []),
	(u'wifes/children/name', None),
	(u'wifes/children/age', None),
	(u'browser', []),
	(u'browser/name', None)])


	# build a list of repeating sections and pop them out of columns
	repeat_columns = [column if columns.pop(column, column) else column
	for column, column_type in columns.items()
	if isinstance(column_type, list)]

	from pprint import pprint as pp ; pp(repeat_columns)
	from pprint import pprint as pp ; pp(columns)


	def dataframe_for_record(record):

	rows = {}

	def create_new_row(data, record, columns, depth):

	row_id = uuid.uuid4().hex
	row = copy.copy(data)
	row.update({'depth': depth})

	for column in columns:
	if column in record:
	row.update({column: record.get(column)})

	if row_id in rows.keys():
	return rows.get(row_id)

	rows.update({row_id: row})
	return row

	def update_row(row, key, value):
	row.update({key: value})

	def traverse_repeat_sections(iterobj, columns, row, record, depth):

	if isinstance(iterobj, dict):
	for key, value in iterobj.items():
	if key in columns:
	# we have a wanted data
	# we know it's not a list
	update_row(row, key, value)
	for key, value in iterobj.items():
	if isinstance(value, (list, dict)):
	# somes values are dict/list.
	# might contain wanted data
	if isinstance(value, list):
	d = depth + 1
	else:
	d = depth
	traverse_repeat_sections(value, columns, row, record, d)

	elif isinstance(iterobj, list):
	for item in iterobj:
	# create a row for each
	if row.get('depth') != depth:
	nrow = create_new_row(row, record, columns, depth)
	else:
	nrow = row
	traverse_repeat_sections(item, columns, nrow, record, depth + 1)
	else:
	print('??? %s' % iterobj)

	# construct rows for each repeat sections
	for repeat_section in repeat_columns:
	traverse_repeat_sections(record.get(repeat_section),
	columns.keys(), {}, record, 0)

	# if we have no rows yet (no repeat sections), let's build a simpe one.
	if not len(rows):
	create_new_row({}, record, columns, 0)

	def cleanup_intermediate_rows(rows, max_depth):
	ids = list(rows.keys())
	for row_id in ids:
	del(rows[row_id]['depth'])

	# get max depth
	max_depth = max([d.get('depth', 0) for d in rows.values()])

	# remove depth column and intermediate rows
	cleanup_intermediate_rows(rows, max_depth)

	series = []
	for row_id, row in rows.items():
	s = pd.Series(row.values(), index=row.keys())
	series.append(s)

	return pd.DataFrame(series)

	main_df = None
	dfs = []
	for record in [recorda, recordb, recordc, recordd]:
	df = dataframe_for_record(record)
	dfs.append(df)
	from pprint import pprint as pp ; pp(df)
	if main_df is None:
	main_df = df
	else:
	main_df = pd.concat([main_df, df], join='outer', ignore_index=True)
	print('############')
	# main_df = main_df.dropna(axis=1)
	from pprint import pprint as pp ; pp(main_df)