Last active
February 23, 2024 12:51
-
-
Save brisvag/ebab608cc0372b00c14eae693158e240 to your computer and use it in GitHub Desktop.
Utilities to easily convert between nested recarrays and DataFrames without sacrificing performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import numpy as np
import pandas as pd
def rec2df(arr):
    """Flatten a (possibly nested) recarray into a dataframe.

    Plain one-dimensional fields become a column named after the field.
    Fields holding several values per row are split into columns named
    ``<field>_colNN``, one per index along the second axis. Fields that
    themselves have named sub-fields are expanded recursively; the
    resulting columns carry the sub-field names only.
    """
    def _flatten(field, label=None):
        subfields = field.dtype.names
        if subfields is not None:
            # structured field: recurse into every named sub-field
            columns = {}
            for sub in subfields:
                columns.update(_flatten(field[sub], sub))
            return columns
        if field.ndim == 1:
            # ordinary column, passed through untouched
            return {label: field}
        # several values per row: one indexed column per entry of axis 1
        return {
            f'{label}_col{k:02}': field[:, k]
            for k in range(field.shape[1])
        }

    return pd.DataFrame(_flatten(arr))
def df2rec(df):
    """Repack a flat dataframe into a recarray.

    Columns whose name ends in ``_col<digits>`` are grouped by the part
    of the name before that suffix and stacked, in increasing index
    order, into a single field holding one sub-array per row. Every
    other column becomes a plain field of the same name.

    Only the ``_colNN`` grouping is rebuilt; the result has one flat
    level of fields.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with string column labels.

    Returns
    -------
    numpy.recarray
        One field per plain column or per ``_colNN`` group, in order of
        first appearance in ``df``.
    """
    # NOTE: relies on the module-level ``import re``.
    grouped = {}
    for col, series in df.items():
        if match := re.search(r'(.*)_col(\d+)$', col):
            # indexed column: collect it under its base name, keyed by index
            base = match.group(1)
            idx = int(match.group(2))
            grouped.setdefault(base, {})[idx] = series
        else:
            grouped[col] = series

    arrs = []
    dtypes = []
    for name, sub in grouped.items():
        if isinstance(sub, dict):
            # sort by index so the stacking order does not depend on the
            # order of the columns in the dataframe
            arr = np.stack([v for _, v in sorted(sub.items())], axis=1)
            dtype = (name, arr.dtype, (len(sub),))
        else:
            arr = sub
            dtype = (name, arr.dtype)
        arrs.append(arr)
        dtypes.append(dtype)
    return np.rec.fromarrays(arrs, dtype=dtypes)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment