Last active
July 3, 2021 17:36
-
-
Save TAJD/9b30c92d12b0908781d2a39aba47237b to your computer and use it in GitHub Desktop.
Useful snippets for working with pandas dataframes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Optional

import numpy as np
import pandas as pd
def check_if_integer(col: pd.Series) -> bool:
    """Check if changing the data type of a series to integer changes its values.

    :param col: series to test (pd.Series)
    :return: True when every value is unchanged by a cast to ``int``; False when
        any value changes or when the series cannot be cast to ``int`` at all,
        e.g. because it contains NaN or infinite values (bool)
    """
    try:
        as_int = col.astype(int)
    except (TypeError, ValueError):
        # pandas refuses to cast NaN/inf (and non-numeric data) to int; such a
        # column is by definition not integer-representable.
        return False
    return bool(np.array_equal(col, as_int))
# Candidate dtypes in the order they are tried. Signed and unsigned types of the
# same width are adjacent, so the first match is the narrowest type that fits.
_INT_CANDIDATES = (
    np.int8,
    np.uint8,
    np.int16,
    np.uint16,
    np.int32,
    np.uint32,
    np.int64,
    np.uint64,
)
_FLOAT_CANDIDATES = (np.float16, np.float32)


def _first_fitting_dtype(low, high, candidates, limits_of):
    """Return the first dtype in ``candidates`` whose range contains [low, high].

    ``limits_of`` is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when no
    candidate fits (for example when ``high`` is infinite).
    """
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value equal to the dtype's min/max still fits.
        if low >= limits.min and high <= limits.max:
            return candidate
    return None


def reduce_mem_usage(
    df: pd.DataFrame,
    int_cast: bool = True,
    obj_to_category: bool = False,
    subset: Optional[list] = None,
) -> pd.DataFrame:
    """
    Iterate through the columns of a dataframe and modify the data type to reduce memory usage.

    The dataframe is modified in place and the same object is returned. Memory
    usage before and after, and the relative saving, are printed.

    Integer columns are cast to the narrowest integer dtype whose range contains
    the column's min and max. Float columns are cast to the narrowest float dtype
    whose *range* contains the data; precision is not checked, so values stored
    as float16/float32 may be rounded. Bool, datetime, timedelta, complex and
    category columns are left untouched, as are columns without any non-missing
    value.

    :param df: dataframe to reduce (pd.DataFrame)
    :param int_cast: indicate if columns should be tried to be casted to int (bool)
    :param obj_to_category: convert object/string columns to category dtype (bool)
    :param subset: subset of columns to analyse; all columns when None (list)
    :return: dataframe with the column dtypes adjusted (pd.DataFrame)
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    cols = subset if subset is not None else df.columns.tolist()
    for col in cols:
        series = df[col]
        col_type = series.dtype
        # One-letter dtype kind: "i" signed int, "u" unsigned int, "f" float.
        kind = getattr(col_type, "kind", "O")

        if kind in ("i", "u", "f"):
            n_valid = series.count()
            if n_valid == 0:
                # Empty or all-missing column: there is no value range to measure.
                continue
            has_missing = n_valid < len(series)
            c_min = series.min()
            c_max = series.max()

            if has_missing:
                # Missing values cannot be stored in a NumPy integer dtype.
                treat_as_int = False
            elif kind in ("i", "u"):
                treat_as_int = True
            elif int_cast:
                # test if column can be converted to an integer
                try:
                    treat_as_int = bool(check_if_integer(series))
                except (TypeError, ValueError):
                    # e.g. infinite values cannot be cast to int
                    treat_as_int = False
            else:
                treat_as_int = False

            if treat_as_int:
                target = _first_fitting_dtype(c_min, c_max, _INT_CANDIDATES, np.iinfo)
            else:
                target = _first_fitting_dtype(c_min, c_max, _FLOAT_CANDIDATES, np.finfo)
                if target is None:
                    target = np.float64
            if target is not None:
                df[col] = series.astype(target)
        elif obj_to_category and (
            col_type == object or isinstance(col_type, pd.StringDtype)
        ):
            df[col] = series.astype("category")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage after optimization is: {:.3f} MB".format(end_mem))
    # Guard against a zero-byte frame so the percentage is always a number.
    if start_mem > 0:
        saved_pct = 100 * (start_mem - end_mem) / start_mem
    else:
        saved_pct = 0.0
    print("Decreased by {:.1f}%".format(saved_pct))
    return df
# Example usage: build a 1000-row frame with one int64 and one float64 column,
# then shrink it. Column "b" holds only whole numbers, so with the default
# int_cast=True it is eligible for integer downcasting as well.
# reduce_mem_usage modifies its argument and returns it, so df_compressed
# refers to the same (now modified) object as df.
df = pd.DataFrame(
    {
        "a": np.arange(1000, dtype="int64"),
        "b": np.arange(1000, dtype="float64"),
    }
)
df_compressed = reduce_mem_usage(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment