# TAJD/save_memory.py

## save_memory.py
from typing import List, Optional

import numpy as np
import pandas as pd


def check_if_integer(col: pd.Series) -> bool:
    """
    Check whether casting a series to integer leaves every value unchanged.

    A series with missing values, or with values that cannot be cast to int
    at all (e.g. infinite or non-numeric entries), is reported as not
    integer-like instead of raising.
    :param col: series to test (pd.Series)
    :return: True if every value survives the integer cast unchanged (bool)
    """
    # Missing values have no integer representation, so the cast below would raise.
    if col.isna().any():
        return False

    try:
        as_int = col.astype(int)
    except (TypeError, ValueError, OverflowError):
        # pandas refuses to cast inf / non-numeric values to int.
        return False

    return bool(np.array_equal(col, as_int))


def _smallest_int_dtype(c_min: int, c_max: int) -> Optional[type]:
    """Return the narrowest NumPy integer type whose range holds [c_min, c_max], or None."""
    # A signed type is tried before the unsigned type of the same width.
    candidates = (
        np.int8,
        np.uint8,
        np.int16,
        np.uint16,
        np.int32,
        np.uint32,
        np.int64,
        np.uint64,
    )
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        # Bounds are inclusive: a column holding exactly -128..127 fits int8.
        if bounds.min <= c_min and c_max <= bounds.max:
            return candidate
    return None


def _smallest_float_dtype(c_min: float, c_max: float) -> type:
    """Return the narrowest NumPy float type whose finite range holds [c_min, c_max]."""
    for candidate in (np.float16, np.float32):
        bounds = np.finfo(candidate)
        if float(bounds.min) <= c_min and c_max <= float(bounds.max):
            return candidate
    # Infinite or very large values end up here.
    return np.float64


def reduce_mem_usage(
    df: pd.DataFrame,
    int_cast: bool = True,
    obj_to_category: bool = False,
    subset: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Iterate through the columns of a dataframe and narrow their dtypes to reduce memory usage.

    The dataframe is modified in place and also returned. Memory usage before
    and after is printed. Float columns are narrowed based on their value
    range only, so float16/float32 results may lose precision. Boolean,
    complex, datetime and category columns are left untouched, as are
    integer columns that contain missing values.
    :param df: dataframe to reduce (pd.DataFrame)
    :param int_cast: indicate if non-integer columns should be tried to be casted to int (bool)
    :param obj_to_category: convert object/string columns to category dtype (bool)
    :param subset: subset of columns to analyse; all columns when None (list)
    :return: dataframe with the column dtypes adjusted (pd.DataFrame)
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    cols = subset if subset is not None else df.columns.tolist()
    types = pd.api.types

    for col in cols:
        series = df[col]
        col_type = series.dtype

        # Casting bool to a numeric type saves nothing and loses the dtype.
        if types.is_bool_dtype(col_type) or types.is_complex_dtype(col_type):
            continue

        if types.is_numeric_dtype(col_type):
            # Empty or all-missing columns have no usable min/max.
            if not series.notna().any():
                continue

            c_min = series.min()
            c_max = series.max()
            all_finite = bool(
                series.notna().all() and np.isfinite(c_min) and np.isfinite(c_max)
            )

            # Covers signed, unsigned and nullable integer dtypes alike.
            treat_as_int = types.is_integer_dtype(col_type)
            if int_cast and not treat_as_int and all_finite:
                # Only finite, gap-free columns can be integer-valued.
                treat_as_int = check_if_integer(series)

            target = None
            if treat_as_int:
                if all_finite:
                    target = _smallest_int_dtype(int(c_min), int(c_max))
            elif types.is_float_dtype(col_type):
                target = _smallest_float_dtype(float(c_min), float(c_max))

            if target is not None and col_type != target:
                df[col] = series.astype(target)
        elif obj_to_category and (
            types.is_object_dtype(col_type) or types.is_string_dtype(col_type)
        ):
            df[col] = series.astype("category")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage after optimization is: {:.3f} MB".format(end_mem))
    # Guard the ratio so a zero-byte frame does not print "nan".
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print("Decreased by {:.1f}%".format(decrease))

    return df

# Example usage: build a small two-column frame (one int64 column, one
# float64 column) and let reduce_mem_usage narrow its dtypes.
df = pd.DataFrame(
    {
        "a": np.arange(1000, dtype="int64"),
        "b": np.arange(1000.0, dtype="float64"),
    }
)

# reduce_mem_usage returns the same frame object it was given.
df_compressed = reduce_mem_usage(df)
	import numpy as np
	import pandas as pd


	def check_if_integer(col: pd.Series) -> bool:
	    """
	    Check whether casting a series to integer leaves every value unchanged.

	    A series with missing values, or with values that cannot be cast to int
	    at all (e.g. infinite or non-numeric entries), is reported as not
	    integer-like instead of raising.
	    :param col: series to test (pd.Series)
	    :return: True if every value survives the integer cast unchanged (bool)
	    """
	    # Missing values have no integer representation, so the cast below would raise.
	    if col.isna().any():
	        return False

	    try:
	        as_int = col.astype(int)
	    except (TypeError, ValueError, OverflowError):
	        # pandas refuses to cast inf / non-numeric values to int.
	        return False

	    return bool(np.array_equal(col, as_int))


	def _smallest_int_dtype(c_min: int, c_max: int) -> Optional[type]:
	    """Return the narrowest NumPy integer type whose range holds [c_min, c_max], or None."""
	    # A signed type is tried before the unsigned type of the same width.
	    candidates = (
	        np.int8,
	        np.uint8,
	        np.int16,
	        np.uint16,
	        np.int32,
	        np.uint32,
	        np.int64,
	        np.uint64,
	    )
	    for candidate in candidates:
	        bounds = np.iinfo(candidate)
	        # Bounds are inclusive: a column holding exactly -128..127 fits int8.
	        if bounds.min <= c_min and c_max <= bounds.max:
	            return candidate
	    return None


	def _smallest_float_dtype(c_min: float, c_max: float) -> type:
	    """Return the narrowest NumPy float type whose finite range holds [c_min, c_max]."""
	    for candidate in (np.float16, np.float32):
	        bounds = np.finfo(candidate)
	        if float(bounds.min) <= c_min and c_max <= float(bounds.max):
	            return candidate
	    # Infinite or very large values end up here.
	    return np.float64


	def reduce_mem_usage(
	    df: pd.DataFrame,
	    int_cast: bool = True,
	    obj_to_category: bool = False,
	    subset: Optional[List[str]] = None,
	) -> pd.DataFrame:
	    """
	    Iterate through the columns of a dataframe and narrow their dtypes to reduce memory usage.

	    The dataframe is modified in place and also returned. Memory usage before
	    and after is printed. Float columns are narrowed based on their value
	    range only, so float16/float32 results may lose precision. Boolean,
	    complex, datetime and category columns are left untouched, as are
	    integer columns that contain missing values.
	    :param df: dataframe to reduce (pd.DataFrame)
	    :param int_cast: indicate if non-integer columns should be tried to be casted to int (bool)
	    :param obj_to_category: convert object/string columns to category dtype (bool)
	    :param subset: subset of columns to analyse; all columns when None (list)
	    :return: dataframe with the column dtypes adjusted (pd.DataFrame)
	    """
	    start_mem = df.memory_usage().sum() / 1024 ** 2
	    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

	    cols = subset if subset is not None else df.columns.tolist()
	    types = pd.api.types

	    for col in cols:
	        series = df[col]
	        col_type = series.dtype

	        # Casting bool to a numeric type saves nothing and loses the dtype.
	        if types.is_bool_dtype(col_type) or types.is_complex_dtype(col_type):
	            continue

	        if types.is_numeric_dtype(col_type):
	            # Empty or all-missing columns have no usable min/max.
	            if not series.notna().any():
	                continue

	            c_min = series.min()
	            c_max = series.max()
	            all_finite = bool(
	                series.notna().all() and np.isfinite(c_min) and np.isfinite(c_max)
	            )

	            # Covers signed, unsigned and nullable integer dtypes alike.
	            treat_as_int = types.is_integer_dtype(col_type)
	            if int_cast and not treat_as_int and all_finite:
	                # Only finite, gap-free columns can be integer-valued.
	                treat_as_int = check_if_integer(series)

	            target = None
	            if treat_as_int:
	                if all_finite:
	                    target = _smallest_int_dtype(int(c_min), int(c_max))
	            elif types.is_float_dtype(col_type):
	                target = _smallest_float_dtype(float(c_min), float(c_max))

	            if target is not None and col_type != target:
	                df[col] = series.astype(target)
	        elif obj_to_category and (
	            types.is_object_dtype(col_type) or types.is_string_dtype(col_type)
	        ):
	            df[col] = series.astype("category")

	    end_mem = df.memory_usage().sum() / 1024 ** 2
	    print("Memory usage after optimization is: {:.3f} MB".format(end_mem))
	    # Guard the ratio so a zero-byte frame does not print "nan".
	    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
	    print("Decreased by {:.1f}%".format(decrease))

	    return df

	# Example usage: build a small two-column frame (one int64 column, one
	# float64 column) and let reduce_mem_usage narrow its dtypes.
	df = pd.DataFrame(
	    {
	        "a": np.arange(1000, dtype="int64"),
	        "b": np.arange(1000.0, dtype="float64"),
	    }
	)

	# reduce_mem_usage returns the same frame object it was given.
	df_compressed = reduce_mem_usage(df)