Skip to content

Instantly share code, notes, and snippets.

@brisvag
Last active February 23, 2024 12:51
Show Gist options
  • Save brisvag/ebab608cc0372b00c14eae693158e240 to your computer and use it in GitHub Desktop.
Save brisvag/ebab608cc0372b00c14eae693158e240 to your computer and use it in GitHub Desktop.
Utilities to easily convert between nested recarrays and DataFrames without sacrificing performance
import re

import numpy as np
import pandas as pd
def rec2df(arr):
    """Unpack a (possibly nested) record array into a flat DataFrame.

    Parameters
    ----------
    arr : numpy.ndarray or numpy.recarray
        Array to flatten. Fields of a structured dtype are expanded
        recursively.

    Returns
    -------
    pandas.DataFrame
        One column per leaf field. A 1-D field keeps its field name. A
        multidimensional field is split along its second axis into columns
        named ``<field>_col00``, ``<field>_col01``, ...; any further axes are
        split the same way, chaining the suffix
        (e.g. ``<field>_col00_col01``).

    Notes
    -----
    Columns coming from nested structured fields are named after the
    innermost field only; the parent field name is not included. Leaves that
    share a name therefore overwrite each other (the last one wins).
    """
    def _unpack_recarray(arr, name=None):
        names = arr.dtype.names
        dct = {}
        if names is not None:
            # named subcolumns: expand each field in declaration order
            for subname in names:
                dct.update(_unpack_recarray(arr[subname], subname))
        elif arr.ndim == 1:
            # normal column, just spit it out
            dct[name] = arr
        else:
            # column with multidimensional data: split it into indexed
            # columns, recursing so that every emitted column is 1-D even
            # when the field has more than two dimensions
            for i in range(arr.shape[1]):
                dct.update(_unpack_recarray(arr[:, i], f'{name}_col{i:02}'))
        return dct

    return pd.DataFrame(_unpack_recarray(arr))
def df2rec(df):
    """Repack a flat DataFrame into a record array.

    Columns whose name ends in ``_col<digits>`` are grouped by the part of
    the name before that suffix and stacked, ordered by the numeric index,
    into a single multidimensional field. Every other column becomes a plain
    field of the same name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to repack. Column labels must be strings.

    Returns
    -------
    numpy.recarray
        One field per plain column or per group of indexed columns, in order
        of first appearance in ``df``.

    Raises
    ------
    ValueError
        If a plain column and a group of indexed columns would map to the
        same field name (e.g. ``a`` together with ``a_col00``).
    """
    # Compiled once; the greedy prefix makes the *last* ``_colNN`` the index.
    indexed = re.compile(r'(.*)_col(\d+)$')

    cols = {}
    for col, series in df.items():
        if match := indexed.search(col):
            base = match.group(1)
            group = cols.setdefault(base, {})
            if not isinstance(group, dict):
                # Without this check the index assignment below would be
                # applied to the plain column's Series instead of a group.
                raise ValueError(
                    f'column {col!r} conflicts with plain column {base!r}'
                )
            group[int(match.group(2))] = series
        else:
            if isinstance(cols.get(col), dict):
                # Overwriting would silently drop the indexed columns.
                raise ValueError(
                    f'column {col!r} conflicts with indexed columns '
                    f'named {col!r}_colNN'
                )
            cols[col] = series

    arrs = []
    dtypes = []
    for col, sub in cols.items():
        if isinstance(sub, dict):
            # Convert to numpy first so the field dtype is a numpy dtype,
            # then stack the pieces in index order along a new second axis.
            arr = np.stack([np.asarray(sub[i]) for i in sorted(sub)], axis=1)
            dtype = (col, arr.dtype, (len(sub),))
        else:
            arr = np.asarray(sub)
            dtype = (col, arr.dtype)
        arrs.append(arr)
        dtypes.append(dtype)
    return np.rec.fromarrays(arrs, dtype=dtypes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment