Created
May 20, 2024 08:45
-
-
Save do-me/f9e1759f18b45aa82a8dd048f5000d74 to your computer and use it in GitHub Desktop.
Pandas custom parquet save with pickle for list of lists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import pickle | |
def write_pd_pickle(df, filename, pickle_cols=None):
    """
    Write a pandas DataFrame to a Parquet file, pickling specified columns.

    Columns listed in ``pickle_cols`` are serialized with ``pickle.dumps``
    before saving, which lets Parquet store values it does not natively
    support, such as lists of lists or dictionaries. The input DataFrame is
    not modified; the conversion happens on a copy.

    Parameters:
    - df: pandas DataFrame to be written.
    - filename: the name of the output Parquet file.
    - pickle_cols: a list of column names in the DataFrame to be pickled.

    Returns:
    - A list of pickled column names.

    Raises:
    - ValueError: if pickle_cols is missing, not a list, or names a column
      that is not in the DataFrame.

    Example:
    >>> df_auto = pd.DataFrame({'test_list': [[1, 2], [3, 4]], 'list_of_lists': [[[1], [2]], [[3], [4]]]})
    >>> write_pd_pickle(df_auto, "test.parquet", ["test_list", "list_of_lists"])
    ['test_list', 'list_of_lists']
    """
    # Reject non-list inputs up front: a bare string would otherwise be
    # iterated character by character and silently pickle the wrong columns.
    if not isinstance(pickle_cols, list):
        raise ValueError("pickle_cols must be a list of column names.")

    # Fail fast on unknown columns BEFORE the (potentially large) copy.
    for col in pickle_cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")

    df_to_write = df.copy()
    # Pickle specified columns element-wise so Parquet sees plain bytes.
    for col in pickle_cols:
        df_to_write[col] = df_to_write[col].apply(pickle.dumps)

    # Write DataFrame to Parquet file
    df_to_write.to_parquet(filename, index=False)
    return pickle_cols
# write_pd_pickle(df_auto, "test.parquet", ["test_list", "list_of_lists"])
def is_pickled(column_sample):
    """
    Heuristically check whether a single value is a pickled bytes payload.

    Returns True only if the value is bytes-like AND deserializes cleanly
    with ``pickle.loads``. Note: unpickling is unsafe on untrusted data
    (pickle can execute arbitrary code) — only probe files you wrote
    yourself, e.g. via ``write_pd_pickle``.
    """
    # Non-bytes values (lists, strings, numbers, None) can never be pickles;
    # skip the exception-driven path entirely for them.
    if not isinstance(column_sample, (bytes, bytearray)):
        return False
    try:
        pickle.loads(column_sample)
        return True
    except Exception:
        # Any deserialization failure means "not a pickle" for our purposes.
        return False
def read_pd_unpickle(filename, unpickle_cols=None):
    """
    Read a Parquet file into a pandas DataFrame, unpickling specified columns.

    If ``unpickle_cols`` is not provided, the function attempts to auto-detect
    pickled columns by probing the first non-null value of each column.
    Note: unpickling runs arbitrary code on malicious input — only read
    files you created yourself (e.g. via ``write_pd_pickle``).

    Parameters:
    - filename: the name of the Parquet file to read.
    - unpickle_cols: a list of column names to be unpickled. If None,
      auto-detection is used.

    Returns:
    - A pandas DataFrame with specified columns unpickled.

    Raises:
    - ValueError: if unpickle_cols is not a list, or a column listed in
      unpickle_cols is not in the DataFrame.

    Examples:
    Auto-detection mode:
    >>> df_auto = read_pd_unpickle("test.parquet")
    >>> df_auto
      test_list list_of_lists
    0    [1, 2]    [[1], [2]]
    1    [3, 4]    [[3], [4]]

    Manual mode, specifying columns to unpickle:
    >>> df_manual = read_pd_unpickle("test.parquet", unpickle_cols=["test_list", "list_of_lists"])
    >>> df_manual
      test_list list_of_lists
    0    [1, 2]    [[1], [2]]
    1    [3, 4]    [[3], [4]]

    Quick check if the two DataFrames are the same:
    >>> str(df_auto) == str(df_manual)
    True
    """
    # Validate the argument BEFORE the (potentially expensive) file read, so
    # a bad call fails immediately instead of after loading the whole file.
    if unpickle_cols is not None and not isinstance(unpickle_cols, list):
        raise ValueError("unpickle_cols must be a list of column names.")

    df = pd.read_parquet(filename)

    if unpickle_cols is None:
        # Auto-detection mode: inspect the first non-null value per column.
        unpickle_cols = []
        for col in df.columns:
            non_null_items = df[col].dropna()
            if len(non_null_items) > 0 and is_pickled(non_null_items.iloc[0]):
                unpickle_cols.append(col)

    # Unpickle specified columns, leaving nulls untouched.
    for col in unpickle_cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")
        df[col] = df[col].apply(lambda x: pickle.loads(x) if pd.notnull(x) else x)
    return df
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.