Skip to content

Instantly share code, notes, and snippets.

@TAJD
Last active July 3, 2021 17:36
Show Gist options
  • Save TAJD/9b30c92d12b0908781d2a39aba47237b to your computer and use it in GitHub Desktop.
Save TAJD/9b30c92d12b0908781d2a39aba47237b to your computer and use it in GitHub Desktop.
Useful snippets for working with pandas DataFrames (reducing memory usage by downcasting column dtypes)
import numpy as np
import pandas as pd
def check_if_integer(col: pd.Series) -> bool:
    """
    Check whether casting a series to integer leaves every value unchanged.

    :param col: series to test (pd.Series)
    :return: True if ``col.astype(int)`` is element-wise equal to ``col``;
        False if any value would change or if the cast is not possible at all,
        e.g. the series holds NaN/inf or non-numeric values (bool)
    """
    try:
        as_int = col.astype(int)
    except (TypeError, ValueError, OverflowError):
        # pandas refuses to cast NaN/inf (and non-numeric data) to int. A
        # column that cannot be cast is, by definition, not integer-valued,
        # so answer False instead of propagating the error to the caller.
        return False
    return bool(np.array_equal(col, as_int))
def _smallest_int_dtype(lo: int, hi: int):
    """Return the first integer dtype whose range contains ``[lo, hi]``, or None."""
    # Candidates are tried narrowest first, signed before unsigned at each width.
    candidates = (
        np.int8,
        np.uint8,
        np.int16,
        np.uint16,
        np.int32,
        np.uint32,
        np.int64,
        np.uint64,
    )
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a value equal to the dtype's min/max still fits.
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def _smallest_float_dtype(lo, hi):
    """Return the narrowest float dtype whose finite range contains ``[lo, hi]``."""
    for candidate in (np.float16, np.float32):
        limits = np.finfo(candidate)
        # NaN or infinite bounds fail both comparisons and fall through to float64.
        if lo >= limits.min and hi <= limits.max:
            return candidate
    return np.float64


def reduce_mem_usage(
    df: pd.DataFrame,
    int_cast: bool = True,
    obj_to_category: bool = False,
    subset: "list | None" = None,
) -> pd.DataFrame:
    """
    Iterate through the columns of a dataframe and modify the data type to reduce memory usage.

    Integer and float columns are downcast to the narrowest dtype whose range
    holds the column's minimum and maximum. Columns of any other dtype (bool,
    datetime, timedelta, category, extension dtypes, ...) are left unchanged,
    except that object columns are converted to ``category`` when
    ``obj_to_category`` is set. Empty columns are left unchanged.

    The dataframe is modified in place and also returned. Memory usage before
    and after is printed to stdout.

    NOTE: float columns are narrowed based on their value *range* only, so
    casting to float16/float32 can lose precision.

    :param df: dataframe to reduce (pd.DataFrame)
    :param int_cast: try to cast float columns holding only whole, finite
        numbers to an integer dtype (bool)
    :param obj_to_category: convert object columns to category dtype (bool)
    :param subset: subset of columns to analyse; all columns when None (list)
    :return: the same dataframe with the column dtypes adjusted (pd.DataFrame)
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    cols = subset if subset is not None else df.columns.tolist()
    for col in cols:
        series = df[col]
        col_type = series.dtype

        # Only plain numpy integer/float columns are range-checked; anything
        # else cannot be compared against iinfo/finfo limits meaningfully.
        is_plain_number = isinstance(col_type, np.dtype) and col_type.kind in "iuf"

        if is_plain_number:
            if series.empty:
                # No min/max to inspect, nothing to gain.
                continue

            c_min = series.min()
            c_max = series.max()

            # Signed and unsigned integer columns are integers by construction.
            treat_as_int = col_type.kind in "iu"
            if (
                int_cast
                and not treat_as_int
                # NaN/inf cannot be represented by an integer dtype, so such
                # columns stay floating point instead of attempting the cast.
                and np.isfinite(series.to_numpy()).all()
            ):
                treat_as_int = check_if_integer(series)

            if treat_as_int:
                # Compare as Python ints so the uint64 limits never overflow.
                target = _smallest_int_dtype(int(c_min), int(c_max))
                if target is not None:
                    df[col] = series.astype(target)
            else:
                df[col] = series.astype(_smallest_float_dtype(c_min, c_max))
        elif obj_to_category and col_type == object:
            df[col] = series.astype("category")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage after optimization is: {:.3f} MB".format(end_mem))
    # Guard against a zero-sized frame, where the ratio is undefined.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print("Decreased by {:.1f}%".format(decrease))
    return df
# Example usage: build a 1000-row frame with one int64 column and one float64
# column, then let reduce_mem_usage pick narrower dtypes for both.
df = pd.DataFrame(
    dict(
        a=pd.Series(np.arange(1000), dtype="int64"),
        b=pd.Series(np.arange(1000.0), dtype="float64"),
    )
)
df_compressed = reduce_mem_usage(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment