Skip to content

Instantly share code, notes, and snippets.

@BibMartin
Last active May 9, 2017 12:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BibMartin/b0219727266515fa2af059df7f75b967 to your computer and use it in GitHub Desktop.
Save BibMartin/b0219727266515fa2af059df7f75b967 to your computer and use it in GitHub Desktop.
Hack pandas.DataFrame so that flattened (dot-separated) JSON columns can be navigated as nested attributes
from pandas import DataFrame, MultiIndex
def __getattribute__(self, x):
try:
return _parent__getattribute__(self, x)
except AttributeError:
columns = _parent__getattribute__(self, 'columns')
cols = list(set([x.split('.')[0] for x in columns]))
if x in cols:
_prefix = x + '.'
df = self[[x for x in columns if x.startswith(_prefix)]].rename_axis(
lambda x: x[len(_prefix):], axis=1)
if len(_parent__getattribute__(df, 'columns')):
return df
else:
return self[x]
else:
raise ValueError('{} not in {}'.format(x, cols))
_parent__getattribute__ = DataFrame.__getattribute__
DataFrame.__getattribute__ = __getattribute__
def __dir__(self):
cols = list(set([x.split('.')[0] for x in self.columns]))
return _parent__dir__(self) + list(cols)
_parent__dir__ = DataFrame.__dir__
DataFrame.__dir__ = __dir__
def unstack(x, prefix=""):
if isinstance(x, dict):
out = {}
for key, val in x.items():
z = unstack(val, prefix=key+'.')
if isinstance(z, dict):
for subkey, subval in z.items():
out[prefix+subkey] = subval
else:
out[prefix+key] = val
return out
elif isinstance(x, list):
return unstack({'_'+str(i): val for i,val in enumerate(x)}, prefix=prefix)
else:
return x
def unwind(self):
columns = MultiIndex.from_tuples([tuple(col.split('.', 1))
for col in self.columns])
return DataFrame(self.values,
index=self.index,
columns=columns)
DataFrame.unwind = unwind
# Example
#########
data = [{'state': 'Florida',
'shortname': 'FL',
'info': {
'governor': 'Rick Scott'
},
'counties': [{'name': 'Dade', 'population': 12345},
{'name': 'Broward', 'population': 40000},
{'name': 'Palm Beach', 'population': 60000}]},
{'state': 'Ohio',
'shortname': 'OH',
'info': {
'governor': 'John Kasich'
},
'counties': [{'name': 'Summit', 'population': 1234},
{'name': 'Cuyahoga', 'population': 1337}]}]
df = DataFrame([unstack(x) for x in data])
print(df.counties._0)
print(df.counties.unwind().stack(0))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment