Last active
July 18, 2019 09:50
-
-
Save yohann84L/03c991a361e7f294ea5306d992914cdb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def reduce_mem_usage(props, verbose=True):
    """Downcast the numeric columns of a DataFrame to smaller dtypes, in place.

    Adapted from https://www.kaggle.com/jesucristo/fraud-complete-eda.

    WARNING: this is lossy and mutates ``props``:
      * NaN values in float columns are replaced by -1 (the affected column
        names are returned so the caller can undo or account for this);
      * non-integral float columns wider than 32 bits are cast to float32,
        which reduces precision.

    Only plain NumPy bool/integer/float columns are processed; every other
    dtype (object, category, datetime, pandas extension dtypes, ...) is left
    untouched.

    Args:
        props: pandas DataFrame to shrink. Its columns are reassigned in place.
        verbose: when True (default) a before/after dtype table is redrawn
            after every processed column (the notebook output or terminal is
            cleared each time) and a memory summary is printed at the end.
            When False nothing is printed and the screen is not cleared.

    Returns:
        A ``(props, NAlist)`` tuple: the same DataFrame object that was passed
        in, and the list of column names whose NaN values were filled with -1.
    """
    import os

    import numpy as np

    # IPython and tabulate only drive the live progress table, so a missing
    # package degrades the display instead of breaking the conversion.
    try:
        from IPython import get_ipython
        from IPython.display import clear_output
    except ImportError:
        get_ipython = clear_output = None
    try:
        from tabulate import tabulate
    except ImportError:
        tabulate = None

    is_running_from_nb = get_ipython is not None and get_ipython() is not None
    clear_method = 'cls' if os.name == 'nt' else 'clear'

    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    def _render(rows, headers):
        # Format the progress table, with a plain-text fallback without tabulate.
        if tabulate is not None:
            return tabulate(rows, headers)
        lines = ['  '.join(str(h) for h in headers)]
        lines.extend('  '.join(str(cell) for cell in row) for row in rows)
        return '\n'.join(lines)

    def _update_tab(table, headers, new_value, from_ipython, clear):
        # Clear the previous rendering, append the new row and redraw the table.
        if from_ipython:
            clear_output(wait=True)
        else:
            os.system(clear)
        table.append(new_value)
        print(_render(table, headers))
        return table, headers

    def _pick_dtype(values):
        # Return the smallest safe dtype for ``values`` or None to keep it as is.
        if values.size == 0:
            return None
        is_float = values.dtype.kind == 'f'
        if is_float:
            # Compare element-wise: summing the residuals would let fractional
            # parts of opposite sign cancel out and truncate real data.
            integral = bool(np.isfinite(values).all()
                            and (values == np.trunc(values)).all())
        else:
            integral = True
        if integral:
            # Python ints make the range comparison exact for every source dtype.
            lo, hi = int(values.min()), int(values.max())
            for candidate in (unsigned_types if lo >= 0 else signed_types):
                limits = np.iinfo(candidate)
                if limits.min <= lo and hi <= limits.max:
                    return candidate
            # Integral but outside the 64-bit range: treat as a regular float.
        if is_float and values.dtype.itemsize > 4:
            finite = values[np.isfinite(values)]
            # Casting a magnitude beyond float32's range would turn it into inf.
            if finite.size == 0 or np.abs(finite).max() <= np.finfo(np.float32).max:
                return np.float32
        return None

    start_mem_usg = props.memory_usage().sum() / 1024**2
    header = ['Column', 'dtype before', 'dtype after']
    table = []
    NAlist = []  # Keeps track of columns that have missing values filled in.

    for col in props.columns:
        dtype_before = props[col].dtype
        # Exclude strings, categories, datetimes and pandas extension dtypes.
        if not (isinstance(dtype_before, np.dtype) and dtype_before.kind in 'biuf'):
            continue

        # Integer dtypes cannot hold NaN, so NaN has to be filled first.
        # Infinite values are not NaN: they are kept and force a float result.
        if dtype_before.kind == 'f' and props[col].isna().any():
            NAlist.append(col)
            props[col] = props[col].fillna(-1)

        target = _pick_dtype(props[col].to_numpy())
        if target is not None:
            props[col] = props[col].astype(target)

        if verbose:
            _update_tab(table, header, [col, dtype_before, props[col].dtype],
                        is_running_from_nb, clear_method)

    if verbose:
        # Print final result
        mem_usg = props.memory_usage().sum() / 1024**2
        print()
        print("___MEMORY USAGE BEFORE COMPLETION:___")
        print("Memory usage is: ", start_mem_usg, " MB")
        print()
        print("___MEMORY USAGE AFTER COMPLETION:___")
        print("Memory usage is: ", mem_usg, " MB")
        if start_mem_usg:
            print("This is ", 100 * mem_usg / start_mem_usg, "% of the initial size")
    return props, NAlist
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment