@do-me · Created May 20, 2024 08:45
Pandas custom parquet save with pickle for list of lists
import pandas as pd
import pickle


def write_pd_pickle(df, filename, pickle_cols=None):
    """
    Writes a pandas DataFrame to a Parquet file, pickling specified columns.

    The function takes a DataFrame and pickles the specified columns before saving
    the DataFrame to a Parquet file. This is useful for saving columns that contain
    data types that Parquet might not natively support, such as lists or dictionaries.

    Parameters:
    - df: pandas DataFrame to be written.
    - filename: the name of the output Parquet file.
    - pickle_cols: a list of column names in the DataFrame to be pickled.

    Returns:
    - A list of pickled column names.

    Raises:
    - ValueError: if pickle_cols is not provided or a column listed in pickle_cols is not in the DataFrame.

    Example:
    >>> df_auto = pd.DataFrame({'test_list': [[1, 2], [3, 4]], 'list_of_lists': [[[1], [2]], [[3], [4]]]})
    >>> write_pd_pickle(df_auto, "test.parquet", ["test_list", "list_of_lists"])
    ['test_list', 'list_of_lists']
    """
    if pickle_cols is None:
        raise ValueError("pickle_cols must be a list of column names.")

    df_to_write = df.copy()

    # Pickle specified columns
    for col in pickle_cols:
        if col not in df_to_write.columns:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")
        df_to_write[col] = df_to_write[col].apply(pickle.dumps)

    # Write DataFrame to Parquet file
    df_to_write.to_parquet(filename, index=False)

    return pickle_cols

# write_pd_pickle(df_auto, "test.parquet", ["test_list", "list_of_lists"])


def is_pickled(column_sample):
    """Return True if column_sample can be unpickled (used for auto-detection)."""
    try:
        pickle.loads(column_sample)
        return True
    except Exception:
        return False

def read_pd_unpickle(filename, unpickle_cols=None):
    """
    Reads a Parquet file into a pandas DataFrame, unpickling specified columns.

    The function reads a Parquet file into a DataFrame and unpickles the specified
    columns. If unpickle_cols is not provided, pickled columns are auto-detected by
    trying to unpickle the first non-null value of each column.

    Parameters:
    - filename: the name of the Parquet file to read.
    - unpickle_cols: a list of column names to be unpickled. If None, auto-detection is used.

    Returns:
    - A pandas DataFrame with the specified columns unpickled.

    Raises:
    - ValueError: if unpickle_cols is not a list or a column listed in unpickle_cols is not in the DataFrame.

    Examples:
    Auto-detection mode:
    >>> df_auto = read_pd_unpickle("test.parquet")
    >>> df_auto
      test_list list_of_lists
    0    [1, 2]    [[1], [2]]
    1    [3, 4]    [[3], [4]]

    Manual mode, specifying columns to unpickle:
    >>> df_manual = read_pd_unpickle("test.parquet", unpickle_cols=["test_list", "list_of_lists"])
    >>> df_manual
      test_list list_of_lists
    0    [1, 2]    [[1], [2]]
    1    [3, 4]    [[3], [4]]

    Quick check that the two DataFrames are the same:
    >>> str(df_auto) == str(df_manual)
    True
    """
    df = pd.read_parquet(filename)

    if unpickle_cols is None:
        # Auto-detection mode: a column is treated as pickled if its first non-null value unpickles
        unpickle_cols = []
        for col in df.columns:
            non_null_items = df[col].dropna()
            if len(non_null_items) > 0:
                sample = non_null_items.iloc[0]
                if is_pickled(sample):
                    unpickle_cols.append(col)
    elif not isinstance(unpickle_cols, list):
        raise ValueError("unpickle_cols must be a list of column names.")

    # Unpickle specified columns
    for col in unpickle_cols:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: pickle.loads(x) if pd.notnull(x) else x)
        else:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")

    return df
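

# Minimal round-trip usage sketch, mirroring the docstring examples above
# (the filename "test.parquet" and the demo DataFrame names are illustrative only):
if __name__ == "__main__":
    df_demo = pd.DataFrame({
        "test_list": [[1, 2], [3, 4]],
        "list_of_lists": [[[1], [2]], [[3], [4]]],
    })
    write_pd_pickle(df_demo, "test.parquet", ["test_list", "list_of_lists"])
    df_back = read_pd_unpickle("test.parquet")  # auto-detection mode
    print(df_back)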