Skip to content

Instantly share code, notes, and snippets.

@johnny-godoy
Last active April 1, 2023 18:30
Show Gist options
  • Save johnny-godoy/46979f47c3c9b261744da93ec020fa68 to your computer and use it in GitHub Desktop.
Save johnny-godoy/46979f47c3c9b261744da93ec020fa68 to your computer and use it in GitHub Desktop.
Pandas DataFrame Compression
"""Implement the compress_dataframe function."""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
import pandas as pd
from pandas.core.indexes.base import Index
def _plan_downcasts(lowest, highest, candidates, limits_of) -> dict:
    """Map each column label to the narrowest candidate dtype that holds its range.

    Parameters
    ----------
    lowest, highest: pd.Series
        Per-column minima and maxima, both indexed by the same column labels.
    candidates: sequence of numpy dtypes
        Target dtypes ordered from narrowest to widest.
    limits_of: callable
        ``np.iinfo`` or ``np.finfo``; returns an object exposing ``min``/``max``.

    Returns
    -------
    plan: dict
        ``{label: dtype}`` for every column that fits some candidate.
        Columns that fit none of the candidates are omitted.
    """
    plan = {}
    pending = lowest.index
    for dtype in candidates:
        limits = limits_of(dtype)
        # Inclusive bounds: a value equal to the dtype's min/max still fits.
        # NaN extrema (empty or all-NaN columns) compare False, so such
        # columns never match and keep their dtype.
        fits = np.asarray(
            (lowest.loc[pending] >= limits.min)
            & (highest.loc[pending] <= limits.max),
            dtype=bool,
        )
        plan.update(dict.fromkeys(pending[fits], dtype))
        pending = pending[~fits]
    return plan


def compress_dataframe(
    frame: pd.DataFrame,
    *,
    inplace: bool = False,
    columns: Index = None,
) -> pd.DataFrame:
    """Create a compressed dataframe by downcasting,
    and changing objects to categories if it lowers memory usage.

    The min/max of every numeric column is computed in one vectorized pass.
    Integer columns are cast to the narrowest of int8/int16/int32 whose range
    contains their values; float columns to the narrowest of float16/float32.
    Float downcasting is chosen from the value *range* only, so it can lose
    precision (e.g. 0.1 is not exactly representable in float16).
    Object columns become categoricals when the deep memory usage
    (including the stored Python objects) of the categorical is smaller.

    Parameters
    ----------
    frame: pd.DataFrame
        The dataframe to be compressed.
    inplace: bool, default False
        If False, work on and return a copy. Otherwise, update the input
        dataframe and return that same object.
    columns: single label or list-like, optional
        A list of columns, or a single label which are to be compressed.
        Any list-like (including a tuple) is read as a collection of labels.
        All columns which aren't listed are kept, unchanged.
        If unspecified, all columns will be compressed.

    Returns
    -------
    return_frame: pd.DataFrame
        The compressed dataframe, containing every column of ``frame``.

    Gist
    ----
    https://gist.github.com/johnny-godoy/46979f47c3c9b261744da93ec020fa68"""
    # Local import: at module level pandas is only imported for type checking.
    from pandas.api.types import is_list_like

    if columns is None:
        labels = list(frame.columns)
    elif is_list_like(columns):
        labels = list(columns)
    else:
        # A single label: wrap it so the selection below yields a DataFrame.
        labels = [columns]

    # Keep the whole frame so that unselected columns survive untouched.
    result_frame = frame if inplace else frame.copy()
    selected = result_frame[labels]

    # Getting min/max for every numeric column
    numerics = selected.select_dtypes(["int", "float"])
    c_min, c_max = numerics.min(), numerics.max()

    int_columns = numerics.select_dtypes("int").columns
    float_columns = numerics.select_dtypes("float").columns
    plan = _plan_downcasts(
        c_min.loc[int_columns],
        c_max.loc[int_columns],
        (np.int8, np.int16, np.int32),
        np.iinfo,
    )
    plan.update(
        _plan_downcasts(
            c_min.loc[float_columns],
            c_max.loc[float_columns],
            (np.float16, np.float32),
            np.finfo,
        )
    )
    for label, dtype in plan.items():
        if numerics[label].dtype != dtype:
            result_frame[label] = numerics[label].astype(dtype)

    # Processing objects
    objects = selected.select_dtypes("object")
    for label in objects.columns:
        as_object = objects[label]
        as_category = as_object.astype("category")
        # deep=True measures the referenced Python objects as well; the
        # shallow figure only counts one pointer per row for object columns.
        if as_category.memory_usage(index=False, deep=True) < as_object.memory_usage(
            index=False, deep=True
        ):
            result_frame[label] = as_category

    return result_frame
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment