Last active
April 10, 2023 08:45
-
-
Save Kadam-Tushar/da7769453467308426ce7bd97034fab3 to your computer and use it in GitHub Desktop.
Optimizing memory required by pandas dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gc | |
import numpy as np | |
import pandas as pd | |
''' | |
Manytimes in-memory (RAM) requirment of pandas dataframe is bottleneck for how much data we can load in our dataframes. | |
So one way to reduce memory required by pandas dataframe is changing datatypes of columns to smallest datatype possible. | |
pandas auto-assigns datatype and chooses a datatype of maximum precision / maximum bits to be on safer side. | |
''' | |
''' | |
# todos: | |
Converting strings columns to cat_code columns if there are considerbly less no. of distinct strings | |
Converting to datetype format if its possible | |
''' | |
def optimize_df(x):
    """Return the smallest safe dtype for a single dataframe column.

    param x: pandas Series representing one column of a dataframe.
    returns: a numpy scalar type (e.g. np.int8), the string 'category'
             for object columns, or the column's existing dtype when no
             downcast applies (datetime/category columns, or integers
             already at the narrowest fit).

    We find the min and max values of the column and pick the narrowest
    integer/float dtype whose range still covers them, e.g. int64 -> int16
    or float64 -> float32.
    Modified from https://www.mikulskibartosz.name/how-to-reduce-memory-usage-in-pandas/
    Instead of iterating over each column, callers use df.apply, which
    speeds things up when there are many columns.
    """
    col_type = x.dtype
    # Datetime and category columns are left unchanged. (The original
    # code assigned no result at all for datetime columns, which raised
    # UnboundLocalError on return.)
    if 'datetime' in col_type.name or col_type.name == 'category':
        return col_type
    if col_type == object:
        # Strings: 'category' saves memory when there are relatively few
        # distinct values.
        return 'category'
    c_min = x.min()
    c_max = x.max()
    # np.issubdtype covers unsigned ints too; the old str(dtype)[:3] ==
    # 'int' test missed 'uint*' and sent those columns down the float path.
    if np.issubdtype(col_type, np.integer):
        # Narrowest first. Inclusive >=/<= comparisons so exact boundary
        # values (e.g. 0 or 255 for uint8) still fit the candidate type;
        # the original's strict >/< skipped them. This loop also removes
        # the original's np.np.uint32 typo (AttributeError).
        for cand in (np.int8, np.uint8, np.int16, np.uint16,
                     np.int32, np.uint32, np.int64, np.uint64):
            info = np.iinfo(cand)
            if c_min >= info.min and c_max <= info.max:
                return cand
        return col_type  # nothing narrower fits; keep as-is
    # Float columns: downcast to float32 when the value range allows.
    # (Range check only, as in the original — precision loss is accepted.)
    if c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
        return np.float32
    return np.float64
# Example dataframe to optimize.
df = pd.DataFrame()
df['price'] = pd.Series([1.0, 2.0, 3.0])
df['count'] = pd.Series([1, 2, 3])

# deep=True counts the contents of object columns too, so the before/after
# numbers stay honest when string columns are converted to 'category'
# (plain memory_usage() only counts the object pointers).
start_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
gc.collect()
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

# Map every column to its smallest safe dtype in one pass; df.apply yields
# a Series of {column -> dtype}, which astype consumes as a dict.
df = df.astype(dict(df.apply(optimize_df, axis=0)))
gc.collect()

end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
# Same precision as the "before" line (the original mixed .2f and .3f).
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment