Last active
April 10, 2023 08:45
-
-
Save Kadam-Tushar/da7769453467308426ce7bd97034fab3 to your computer and use it in GitHub Desktop.
Optimizing memory required by pandas dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gc | |
import numpy as np | |
import pandas as pd | |
''' | |
Manytimes in-memory (RAM) requirment of pandas dataframe is bottleneck for how much data we can load in our dataframes. | |
So one way to reduce memory required by pandas dataframe is changing datatypes of columns to smallest datatype possible. | |
pandas auto-assigns datatype and chooses a datatype of maximum precision / maximum bits to be on safer side. | |
''' | |
''' | |
# todos: | |
Converting strings columns to cat_code columns if there are considerbly less no. of distinct strings | |
Converting to datetype format if its possible | |
''' | |
def optimize_df(x):
    """Return the smallest safe dtype for a single dataframe column.

    param x: pandas Series representing one column of a dataframe.
    returns: a numpy scalar type (e.g. np.int8), the string 'category'
             for object columns, or the column's existing dtype when no
             downcast applies (datetime/category columns, or integers
             already at the narrowest fit).

    We find the min and max values of the column and pick the narrowest
    integer/float dtype whose range still covers them, e.g. int64 -> int16
    or float64 -> float32.
    Modified from https://www.mikulskibartosz.name/how-to-reduce-memory-usage-in-pandas/
    Instead of iterating over each column, callers use df.apply, which
    speeds things up when there are many columns.
    """
    col_type = x.dtype
    # Datetime and category columns are left unchanged. (The original
    # code assigned no result at all for datetime columns, which raised
    # UnboundLocalError on return.)
    if 'datetime' in col_type.name or col_type.name == 'category':
        return col_type
    if col_type == object:
        # Strings: 'category' saves memory when there are relatively few
        # distinct values.
        return 'category'
    c_min = x.min()
    c_max = x.max()
    # np.issubdtype covers unsigned ints too; the old str(dtype)[:3] ==
    # 'int' test missed 'uint*' and sent those columns down the float path.
    if np.issubdtype(col_type, np.integer):
        # Narrowest first. Inclusive >=/<= comparisons so exact boundary
        # values (e.g. 0 or 255 for uint8) still fit the candidate type;
        # the original's strict >/< skipped them. This loop also removes
        # the original's np.np.uint32 typo (AttributeError).
        for cand in (np.int8, np.uint8, np.int16, np.uint16,
                     np.int32, np.uint32, np.int64, np.uint64):
            info = np.iinfo(cand)
            if c_min >= info.min and c_max <= info.max:
                return cand
        return col_type  # nothing narrower fits; keep as-is
    # Float columns: downcast to float32 when the value range allows.
    # (Range check only, as in the original — precision loss is accepted.)
    if c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
        return np.float32
    return np.float64
# Example dataframe to optimize.
df = pd.DataFrame()
df['price'] = pd.Series([1.0, 2.0, 3.0])
df['count'] = pd.Series([1, 2, 3])

# deep=True counts the contents of object columns too, so the before/after
# numbers stay honest when string columns are converted to 'category'
# (plain memory_usage() only counts the object pointers).
start_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
gc.collect()
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

# Map every column to its smallest safe dtype in one pass; df.apply yields
# a Series of {column -> dtype}, which astype consumes as a dict.
df = df.astype(dict(df.apply(optimize_df, axis=0)))
gc.collect()

end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
# Same precision as the "before" line (the original mixed .2f and .3f).
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment