Skip to content

Instantly share code, notes, and snippets.

@yohann84L
Last active July 18, 2019 09:50
Show Gist options
  • Save yohann84L/03c991a361e7f294ea5306d992914cdb to your computer and use it in GitHub Desktop.
Save yohann84L/03c991a361e7f294ea5306d992914cdb to your computer and use it in GitHub Desktop.
def _progress_printer(headers):
    """Return a callable that appends one row and redraws the progress table.

    The screen (or the notebook cell output) is cleared before each redraw.
    IPython and tabulate are both optional: without IPython the terminal is
    cleared with the OS command, without tabulate a plain aligned table is
    printed instead.
    """
    from os import name, system

    try:
        from IPython import get_ipython
        from IPython.display import clear_output
        in_notebook = get_ipython() is not None
    except ImportError:
        # No IPython installed means we cannot be inside a notebook.
        clear_output = None
        in_notebook = False

    try:
        from tabulate import tabulate as render
    except ImportError:
        def render(rows, headers):
            # Minimal stand-in so progress stays readable without tabulate.
            cells = [[str(cell) for cell in line] for line in [headers] + rows]
            widths = [max(len(line[i]) for line in cells)
                      for i in range(len(headers))]
            return "\n".join(
                "  ".join(cell.ljust(width) for cell, width in zip(line, widths))
                for line in cells
            )

    clear_command = 'cls' if name == 'nt' else 'clear'
    rows = []

    def add_row(row):
        if in_notebook:
            clear_output(wait=True)
        else:
            system(clear_command)
        rows.append(row)
        print(render(rows, headers))

    return add_row


def _smallest_numeric_dtype(series):
    """Return the narrowest NumPy dtype able to hold *series*, or None.

    *series* must be a numeric (bool/int/uint/float) column without NaN.
    None means "leave the column as it is": the column is empty, or no
    narrower representation is safe.
    """
    import numpy as np

    if series.empty:
        # min()/max() are NaN on an empty column; nothing to decide.
        return None

    if series.dtype.kind == "f":
        raw = series.to_numpy(dtype=np.float64)
        finite = raw[np.isfinite(raw)]
        # Check every element individually: summing the differences lets
        # +0.5 and -0.5 cancel out and would misclassify the column.
        is_integral = (finite.size == raw.size
                       and bool((raw == np.floor(raw)).all()))
    else:
        finite = None
        is_integral = True

    if is_integral:
        # Python ints compare exactly against every iinfo bound.
        low, high = int(series.min()), int(series.max())
        if low >= 0:
            candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
        else:
            candidates = (np.int8, np.int16, np.int32, np.int64)
        for candidate in candidates:
            bounds = np.iinfo(candidate)
            if bounds.min <= low and high <= bounds.max:
                return np.dtype(candidate)

    if finite is None:
        # Integer-typed column that somehow fits nothing: do not touch it.
        return None
    if getattr(series.dtype, "itemsize", 8) <= 4:
        # Already float32 or narrower; casting would not save memory.
        return None
    if finite.size and np.abs(finite).max() > np.finfo(np.float32).max:
        # Finite values beyond the float32 range would overflow to inf.
        return None
    return np.dtype(np.float32)


def reduce_mem_usage(props, *, verbose=True):
    """Shrink a DataFrame's memory footprint by downcasting numeric columns.

    Adapted from https://www.kaggle.com/jesucristo/fraud-complete-eda.

    Every bool/int/uint/float column is cast to the narrowest integer dtype
    that holds its values, or to float32 when it contains non-integral
    values. All other columns (strings, categories, datetimes, ...) are left
    untouched.

    WARNING: this is lossy and works IN PLACE on ``props``.
      * NaN values are replaced by -1 so the column can become an integer.
      * float64 -> float32 reduces precision.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to optimise; its columns are reassigned in place.
    verbose : bool, keyword-only, default True
        When True, clear the screen and redraw a per-column progress table,
        then print a before/after memory summary. When False, print nothing.

    Returns
    -------
    (props, NAlist) : tuple
        The same DataFrame object, and the list of column names whose NaN
        values were replaced by -1.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    add_row = None
    if verbose:
        add_row = _progress_printer(['Column', 'dtype before', 'dtype after'])

    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        dtype_before = props[col].dtype
        # Only plain numeric kinds can be downcast; datetimes, strings,
        # categories and complex numbers are skipped.
        if getattr(dtype_before, "kind", "O") not in ("b", "i", "u", "f"):
            continue

        # Integer dtypes cannot hold NaN, so NaN must be filled first.
        # Infinity is not "missing": fillna would not replace it anyway.
        if props[col].isna().any():
            NAlist.append(col)
            props[col] = props[col].fillna(-1)

        target = _smallest_numeric_dtype(props[col])
        if target is not None:
            props[col] = props[col].astype(target)

        if add_row is not None:
            add_row([col, dtype_before, props[col].dtype])

    if verbose:
        # Print final result
        print()
        print("___MEMORY USAGE BEFORE COMPLETION:___")
        print("Memory usage is: ",start_mem_usg," MB")
        print()
        print("___MEMORY USAGE AFTER COMPLETION:___")
        mem_usg = props.memory_usage().sum() / 1024**2
        print("Memory usage is: ",mem_usg," MB")
        print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment