@do-me · Created May 20, 2024 08:45
Pandas custom parquet save with pickle for list of lists
import pandas as pd
import pickle


def write_pd_pickle(df, filename, pickle_cols=None):
    """
    Writes a pandas DataFrame to a Parquet file, pickling specified columns.

    The function takes a DataFrame and pickles the specified columns before saving
    the DataFrame to a Parquet file. This is useful for saving columns that contain
    data types that Parquet might not natively support, such as lists or dictionaries.

    Parameters:
    - df: pandas DataFrame to be written.
    - filename: the name of the output Parquet file.
    - pickle_cols: a list of column names in the DataFrame to be pickled.

    Returns:
    - A list of pickled column names.

    Raises:
    - ValueError: if pickle_cols is not provided or a column listed in pickle_cols is not in the DataFrame.

    Example:
    >>> df_auto = pd.DataFrame({'test_list': [[1, 2], [3, 4]], 'list_of_lists': [[[1], [2]], [[3], [4]]]})
    >>> write_pd_pickle(df_auto, "test.parquet", ["test_list", "list_of_lists"])
    ['test_list', 'list_of_lists']
    """
    if pickle_cols is None:
        raise ValueError("pickle_cols must be a list of column names.")

    df_to_write = df.copy()

    # Pickle specified columns
    for col in pickle_cols:
        if col not in df_to_write.columns:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")
        df_to_write[col] = df_to_write[col].apply(pickle.dumps)

    # Write DataFrame to Parquet file
    df_to_write.to_parquet(filename, index=False)

    return pickle_cols

# write_pd_pickle(df_auto, "test.parquet", ["test_list", "list_of_lists"])


def is_pickled(column_sample):
    """Return True if column_sample can be unpickled (used for auto-detection)."""
    try:
        pickle.loads(column_sample)
        return True
    except Exception:
        return False

def read_pd_unpickle(filename, unpickle_cols=None):
    """
    Reads a Parquet file into a pandas DataFrame, unpickling specified columns.

    The function reads a Parquet file into a DataFrame and unpickles the specified
    columns. If unpickle_cols is not provided, pickled columns are auto-detected by
    trying to unpickle the first non-null value of each column.

    Parameters:
    - filename: the name of the Parquet file to read.
    - unpickle_cols: a list of column names to be unpickled. If None, auto-detection is used.

    Returns:
    - A pandas DataFrame with the specified columns unpickled.

    Raises:
    - ValueError: if unpickle_cols is not a list or a column listed in unpickle_cols is not in the DataFrame.

    Examples:
    Auto-detection mode:
    >>> df_auto = read_pd_unpickle("test.parquet")
    >>> df_auto
      test_list list_of_lists
    0    [1, 2]    [[1], [2]]
    1    [3, 4]    [[3], [4]]

    Manual mode, specifying columns to unpickle:
    >>> df_manual = read_pd_unpickle("test.parquet", unpickle_cols=["test_list", "list_of_lists"])
    >>> df_manual
      test_list list_of_lists
    0    [1, 2]    [[1], [2]]
    1    [3, 4]    [[3], [4]]

    Quick check that the two DataFrames are the same:
    >>> str(df_auto) == str(df_manual)
    True
    """
    df = pd.read_parquet(filename)

    if unpickle_cols is None:
        # Auto-detection mode: a column is treated as pickled if its first non-null value unpickles
        unpickle_cols = []
        for col in df.columns:
            non_null_items = df[col].dropna()
            if len(non_null_items) > 0:
                sample = non_null_items.iloc[0]
                if is_pickled(sample):
                    unpickle_cols.append(col)
    elif not isinstance(unpickle_cols, list):
        raise ValueError("unpickle_cols must be a list of column names.")

    # Unpickle specified columns
    for col in unpickle_cols:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: pickle.loads(x) if pd.notnull(x) else x)
        else:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")

    return df
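

# Minimal round-trip usage sketch, mirroring the docstring examples above
# (the filename "test.parquet" and the demo DataFrame names are illustrative only):
if __name__ == "__main__":
    df_demo = pd.DataFrame({
        "test_list": [[1, 2], [3, 4]],
        "list_of_lists": [[[1], [2]], [[3], [4]]],
    })
    write_pd_pickle(df_demo, "test.parquet", ["test_list", "list_of_lists"])
    df_back = read_pd_unpickle("test.parquet")  # auto-detection mode
    print(df_back)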