# fny/shrink_dataframe.py

## shrink_dataframe.py
import numpy as np

def shrink_df(df, categorize=False, verbose=False):
    """Reduce the memory use of a data frame by using more compact types.

    Integer and float columns are downcast to the narrowest type of the same
    kind whose representable range covers the column's minimum and maximum.
    Only the value range is checked, so downcasting floats may lose precision.
    Columns of any other dtype are left untouched unless ``categorize`` is set.
    The frame is modified in place and also returned.

    Args:
        df (pandas.DataFrame): The dataframe
        categorize (bool): Whether object-dtype (string) columns should be
                           converted to categorical values.
                           Note this may cause memory use to increase slightly.
        verbose (bool): Whether to print memory savings to stdout.

    Returns:
        df (pandas.DataFrame) A shrunken data frame.

    """
    if verbose:
        start_mem = df.memory_usage().sum() / (1024**2)
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    # Candidate types, ordered from narrowest to widest.
    int_types = [np.int8, np.int16, np.int32, np.int64]
    float_types = [np.float16, np.float32, np.float64]

    for col in df.columns:
        col_type = df[col].dtype

        # Pick the candidate list and the matching range helper: integer
        # limits come from np.iinfo, float limits from np.finfo.
        if col_type in int_types:
            candidates, limits = int_types, np.iinfo
        elif col_type in float_types:
            candidates, limits = float_types, np.finfo
        else:
            # Optionally treat strings as categories
            if categorize and col_type == object:
                df[col] = df[col].astype('category')
            continue

        # Only numeric columns need min/max; computing them on object
        # columns with mixed value types would raise.
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Reaching the current type means nothing narrower fits.
            if col_type == candidate:
                break
            info = limits(candidate)
            # NaN bounds (empty or all-NaN column) fail both comparisons,
            # so such columns keep their dtype.
            if c_min >= info.min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        # Guard against a zero-byte frame to avoid dividing by zero.
        savings = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Savings of {:.1f}%!'.format(savings))

    return df
	import numpy as np

	def shrink_df(df, categorize=False, verbose=False):
	    """Reduce the memory use of a data frame by using more compact types.

	    Integer and float columns are downcast to the narrowest type of the same
	    kind whose representable range covers the column's minimum and maximum.
	    Only the value range is checked, so downcasting floats may lose precision.
	    Columns of any other dtype are left untouched unless ``categorize`` is set.
	    The frame is modified in place and also returned.

	    Args:
	        df (pandas.DataFrame): The dataframe
	        categorize (bool): Whether object-dtype (string) columns should be
	                           converted to categorical values.
	                           Note this may cause memory use to increase slightly.
	        verbose (bool): Whether to print memory savings to stdout.

	    Returns:
	        df (pandas.DataFrame) A shrunken data frame.

	    """
	    if verbose:
	        start_mem = df.memory_usage().sum() / (1024**2)
	        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

	    # Candidate types, ordered from narrowest to widest.
	    int_types = [np.int8, np.int16, np.int32, np.int64]
	    float_types = [np.float16, np.float32, np.float64]

	    for col in df.columns:
	        col_type = df[col].dtype

	        # Pick the candidate list and the matching range helper: integer
	        # limits come from np.iinfo, float limits from np.finfo.
	        if col_type in int_types:
	            candidates, limits = int_types, np.iinfo
	        elif col_type in float_types:
	            candidates, limits = float_types, np.finfo
	        else:
	            # Optionally treat strings as categories
	            if categorize and col_type == object:
	                df[col] = df[col].astype('category')
	            continue

	        # Only numeric columns need min/max; computing them on object
	        # columns with mixed value types would raise.
	        c_min = df[col].min()
	        c_max = df[col].max()

	        for candidate in candidates:
	            # Reaching the current type means nothing narrower fits.
	            if col_type == candidate:
	                break
	            info = limits(candidate)
	            # NaN bounds (empty or all-NaN column) fail both comparisons,
	            # so such columns keep their dtype.
	            if c_min >= info.min and c_max <= info.max:
	                df[col] = df[col].astype(candidate)
	                break

	    if verbose:
	        end_mem = df.memory_usage().sum() / 1024**2
	        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
	        # Guard against a zero-byte frame to avoid dividing by zero.
	        savings = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
	        print('Savings of {:.1f}%!'.format(savings))

	    return df