Created
December 13, 2019 00:12
-
-
Save fny/279e7cb3ae06acafe3178a12741ba4c0 to your computer and use it in GitHub Desktop.
Reduce Memory Usage of a Pandas Dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
def _narrower_dtype(series, candidates, limits):
    """Return the first candidate dtype narrower than the series' own that holds its values.

    Args:
        series (pandas.Series): A numeric column.
        candidates (list): Dtypes ordered from narrowest to widest.
        limits (callable): ``np.iinfo`` or ``np.finfo``; must match the kind
            of dtype listed in ``candidates``.

    Returns:
        The chosen dtype, or None when no narrower candidate fits.
    """
    lowest = series.min()
    highest = series.max()
    for candidate in candidates:
        # Candidates are ordered by width, so reaching the current dtype
        # means nothing narrower was able to hold the values.
        if series.dtype == candidate:
            return None
        bounds = limits(candidate)
        # For an empty or all-NaN column min/max are NaN, both comparisons
        # are False, and the column is left untouched.
        if lowest >= bounds.min and highest <= bounds.max:
            return candidate
    return None


def shrink_df(df, categorize=False, verbose=False):
    """Reduces the memory use of a data frame by using more compact types.

    Signed integer and float columns are cast to the narrowest type of the
    same kind whose representable range covers the column's min and max.
    The check for floats is range-based only, so narrowing a float column
    may lose precision.

    The columns of ``df`` are replaced in place; the same object is returned.

    Args:
        df (pandas.DataFrame): The dataframe
        categorize (bool): Whether strings should be converted to categorical values.
            Note this may cause memory use to increase slightly.
        verbose (bool): Whether to print memory savings to stdout.

    Returns:
        df (pandas.DataFrame) A shrunken data frame.
    """
    int_types = [np.int8, np.int16, np.int32, np.int64]
    float_types = [np.float16, np.float32, np.float64]

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # min/max are only computed (inside the helper) for numeric columns;
        # object columns with mixed values cannot be ordered.
        if col_type in int_types:
            target = _narrower_dtype(df[col], int_types, np.iinfo)
        elif col_type in float_types:
            # Float ranges come from finfo; iinfo only describes integers.
            target = _narrower_dtype(df[col], float_types, np.finfo)
        elif categorize and col_type == object:
            # Optionally treat strings as categories
            target = 'category'
        else:
            target = None

        if target is not None:
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Savings of {:.1f}%!'.format(100 * (start_mem - end_mem) / start_mem))

    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment