johnny-godoy/compress_dataframe.py

## compress_dataframe.py
"""Implement the compress_dataframe function."""
from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
    import pandas as pd
    from pandas.core.indexes.base import Index


def _downcast_by_range(target, source, lower, upper, info, dtypes) -> None:
    """Cast columns of ``source`` to the first dtype in ``dtypes`` that fits.

    ``lower`` and ``upper`` hold the per-column minimum and maximum, ``info``
    is ``np.iinfo`` or ``np.finfo`` and ``dtypes`` is ordered from narrowest
    to widest. Converted columns are written into ``target``; columns that fit
    none of the candidates are left as they are.
    """
    pending = source.columns
    for dtype in dtypes:
        if pending.empty:
            return
        limits = info(dtype)
        # Inclusive bounds: the limits themselves are representable values.
        fits = (lower.loc[pending] >= limits.min) & (
            upper.loc[pending] <= limits.max
        )
        chosen = fits[fits].index
        if len(chosen):
            target[chosen] = source[chosen].astype(dtype)
        # Only columns that were too wide move on to the next candidate.
        pending = fits[~fits].index


def compress_dataframe(
    frame: pd.DataFrame,
    *,
    inplace: bool = False,
    columns: Index = None,
) -> pd.DataFrame:
    """Compress a dataframe by downcasting numbers and categorizing objects.

    Integer and float columns are cast to the narrowest of int8/int16/int32
    or float16/float32 whose range contains the column's minimum and maximum.
    Object columns are turned into categories if it lowers the memory usage
    reported by ``memory_usage(index=False)``.
    Note that this function is vectorized and thus optimized for many columns.

    Parameters
    ----------
    frame: pd.DataFrame
        The dataframe to be compressed.
    inplace: bool, default False
        If False, return a compressed copy and leave ``frame`` untouched.
        Otherwise, update the input dataframe and return it.
    columns: single label or list-like, optional
        A list of columns, or a single label which are to be compressed.
        All other columns are kept in the result, unchanged.
        If unspecified, all columns will be compressed.

    Returns
    -------
    return_frame: pd.DataFrame
        The compressed dataframe, containing every column of ``frame``.

    Notes
    -----
    Float downcasting is decided by value range only, so casting to float16
    or float32 can lose precision.

    Gist
    ----
    https://gist.github.com/johnny-godoy/46979f47c3c9b261744da93ec020fa68
    """
    # Copy the whole frame so that unselected columns survive untouched.
    result_frame = frame.copy()
    selected = result_frame if columns is None else result_frame[columns]
    if selected.ndim == 1:
        # A single label yields a Series; promote it to a one-column frame.
        selected = selected.to_frame()

    # Getting min/max for every numeric column
    numerics = selected.select_dtypes(["int", "float"])
    c_min, c_max = numerics.min(), numerics.max()

    # Processing integers
    _downcast_by_range(
        result_frame,
        numerics.select_dtypes("int"),
        c_min,
        c_max,
        np.iinfo,
        (np.int8, np.int16, np.int32),
    )

    # Processing floats
    _downcast_by_range(
        result_frame,
        numerics.select_dtypes("float"),
        c_min,
        c_max,
        np.finfo,
        (np.float16, np.float32),
    )

    # Processing objects
    objects = selected.select_dtypes("object")
    if not objects.columns.empty:
        categories = objects.astype("category")
        shrinks = categories.memory_usage(index=False) < objects.memory_usage(
            index=False
        )
        to_change = shrinks[shrinks].index
        if len(to_change):
            result_frame[to_change] = categories[to_change]

    if inplace:
        # noinspection PyProtectedMember
        frame._update_inplace(result_frame)
        # Return the caller's object rather than a second frame that would
        # share its internal data with ``frame``.
        return frame
    return result_frame
	"""Implement the compress_dataframe function."""
	from __future__ import annotations

	from typing import TYPE_CHECKING

	import numpy as np

	if TYPE_CHECKING:
	import pandas as pd
	from pandas.core.indexes.base import Index


	def _downcast_by_range(target, source, lower, upper, info, dtypes) -> None:
	    """Cast columns of ``source`` to the first dtype in ``dtypes`` that fits.

	    ``lower`` and ``upper`` hold the per-column minimum and maximum, ``info``
	    is ``np.iinfo`` or ``np.finfo`` and ``dtypes`` is ordered from narrowest
	    to widest. Converted columns are written into ``target``; columns that fit
	    none of the candidates are left as they are.
	    """
	    pending = source.columns
	    for dtype in dtypes:
	        if pending.empty:
	            return
	        limits = info(dtype)
	        # Inclusive bounds: the limits themselves are representable values.
	        fits = (lower.loc[pending] >= limits.min) & (
	            upper.loc[pending] <= limits.max
	        )
	        chosen = fits[fits].index
	        if len(chosen):
	            target[chosen] = source[chosen].astype(dtype)
	        # Only columns that were too wide move on to the next candidate.
	        pending = fits[~fits].index


	def compress_dataframe(
	    frame: pd.DataFrame,
	    *,
	    inplace: bool = False,
	    columns: Index = None,
	) -> pd.DataFrame:
	    """Compress a dataframe by downcasting numbers and categorizing objects.

	    Integer and float columns are cast to the narrowest of int8/int16/int32
	    or float16/float32 whose range contains the column's minimum and maximum.
	    Object columns are turned into categories if it lowers the memory usage
	    reported by ``memory_usage(index=False)``.
	    Note that this function is vectorized and thus optimized for many columns.

	    Parameters
	    ----------
	    frame: pd.DataFrame
	        The dataframe to be compressed.
	    inplace: bool, default False
	        If False, return a compressed copy and leave ``frame`` untouched.
	        Otherwise, update the input dataframe and return it.
	    columns: single label or list-like, optional
	        A list of columns, or a single label which are to be compressed.
	        All other columns are kept in the result, unchanged.
	        If unspecified, all columns will be compressed.

	    Returns
	    -------
	    return_frame: pd.DataFrame
	        The compressed dataframe, containing every column of ``frame``.

	    Notes
	    -----
	    Float downcasting is decided by value range only, so casting to float16
	    or float32 can lose precision.

	    Gist
	    ----
	    https://gist.github.com/johnny-godoy/46979f47c3c9b261744da93ec020fa68
	    """
	    # Copy the whole frame so that unselected columns survive untouched.
	    result_frame = frame.copy()
	    selected = result_frame if columns is None else result_frame[columns]
	    if selected.ndim == 1:
	        # A single label yields a Series; promote it to a one-column frame.
	        selected = selected.to_frame()

	    # Getting min/max for every numeric column
	    numerics = selected.select_dtypes(["int", "float"])
	    c_min, c_max = numerics.min(), numerics.max()

	    # Processing integers
	    _downcast_by_range(
	        result_frame,
	        numerics.select_dtypes("int"),
	        c_min,
	        c_max,
	        np.iinfo,
	        (np.int8, np.int16, np.int32),
	    )

	    # Processing floats
	    _downcast_by_range(
	        result_frame,
	        numerics.select_dtypes("float"),
	        c_min,
	        c_max,
	        np.finfo,
	        (np.float16, np.float32),
	    )

	    # Processing objects
	    objects = selected.select_dtypes("object")
	    if not objects.columns.empty:
	        categories = objects.astype("category")
	        shrinks = categories.memory_usage(index=False) < objects.memory_usage(
	            index=False
	        )
	        to_change = shrinks[shrinks].index
	        if len(to_change):
	            result_frame[to_change] = categories[to_change]

	    if inplace:
	        # noinspection PyProtectedMember
	        frame._update_inplace(result_frame)
	        # Return the caller's object rather than a second frame that would
	        # share its internal data with ``frame``.
	        return frame
	    return result_frame