Skip to content

Instantly share code, notes, and snippets.

@CodeByAidan
Created July 2, 2024 13:16
Show Gist options
  • Save CodeByAidan/40855e18cfe25b17347abe51726276bb to your computer and use it in GitHub Desktop.
Save CodeByAidan/40855e18cfe25b17347abe51726276bb to your computer and use it in GitHub Desktop.
Preserve any size DataFrame, load/save - FAST!
from typing import Optional, TypeGuard
import pandas as pd
from pandas import DataFrame
class DataFrameStore:
    """
    Hold a single pandas DataFrame and persist it to / restore it from a file
    in Feather format.

    Both I/O methods are best-effort: any failure is reported on stdout rather
    than raised to the caller.
    """

    def __init__(self, df: Optional[DataFrame] = None) -> None:
        """
        Initialize the DataFrameStore with an optional DataFrame.

        Parameters
        ----------
        df : DataFrame, optional
            Frame to hold initially. ``None`` creates an empty store.
        """
        self.df: Optional[DataFrame] = df

    def save(self, file_path: str) -> None:
        """
        Save the stored DataFrame to ``file_path`` in Feather format.

        Prints a confirmation on success, ``"No DataFrame to save!"`` when the
        store does not hold a DataFrame, or an error message if writing fails.
        Nothing is raised in any of these cases.

        Parameters
        ----------
        file_path : str
            Destination path, passed unchanged to ``DataFrame.to_feather``.

        Example:
        --------
        >>> import pandas as pd
        >>> df_sample = pd.DataFrame({'A': range(3), 'B': range(3, 6)})
        >>> store = DataFrameStore(df_sample)
        >>> store.save('large_df.feather')
        DataFrame saved to 'large_df.feather'
        """
        # Narrow through a local so the DataFrame type is established by a
        # plain isinstance check rather than by a helper's return annotation.
        frame = self.df
        if not isinstance(frame, pd.DataFrame):
            print("No DataFrame to save!")
            return

        try:
            frame.to_feather(file_path)
            print(f"DataFrame saved to '{file_path}'")
        except Exception as e:
            # Deliberate best-effort: report the failure instead of raising.
            print(f"An error occurred while saving the DataFrame: {e}")

    def load(self, file_path: str, columns: Optional[list] = None) -> None:
        """
        Load a DataFrame from a Feather file into ``self.df``.

        The whole file is read first and the column selection is applied
        afterwards, so ``columns`` trims the frame that is kept but does not
        reduce what is read from disk. Names in ``columns`` that are not
        present in the file are silently ignored.

        If anything fails, an error message is printed and ``self.df`` is
        reset to ``None`` (a previously stored frame is discarded).

        Parameters
        ----------
        file_path : str
            Path of the Feather file, passed unchanged to ``pd.read_feather``.
        columns : list, optional
            Column labels to keep. ``None`` keeps every column.

        Example:
        --------
        >>> import pandas as pd
        >>> df_sample = pd.DataFrame({'A': range(3), 'B': range(3, 6)})
        >>> DataFrameStore(df_sample).save('large_df.feather')
        DataFrame saved to 'large_df.feather'
        >>> store = DataFrameStore()
        >>> store.load('large_df.feather')
        DataFrame loaded from 'large_df.feather'
        >>> store.df
           A  B
        0  0  3
        1  1  4
        2  2  5
        >>> store.load('large_df.feather', columns=['A'])
        DataFrame loaded from 'large_df.feather'
        >>> store.df
           A
        0  0
        1  1
        2  2
        """
        try:
            loaded: DataFrame = pd.read_feather(file_path)
            if columns is not None:
                # Intersect with the file's columns so unknown labels are
                # dropped instead of raising a KeyError.
                wanted = loaded.columns.intersection(columns)
                self.df = loaded.loc[:, wanted]
            else:
                self.df = loaded
            print(f"DataFrame loaded from '{file_path}'")
        except Exception as e:
            # Deliberate best-effort: report the failure and leave the store
            # empty rather than raising.
            print(f"An error occurred while loading the DataFrame: {e}")
            self.df = None

    def __is_df(self) -> bool:
        """
        Return True if the store currently holds a pandas DataFrame.

        Annotated as ``bool``: a ``TypeGuard`` needs an explicit argument to
        narrow, and this method takes none besides ``self``.
        """
        return isinstance(self.df, pd.DataFrame)
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment