Skip to content

Instantly share code, notes, and snippets.

@kaspermunch
Last active May 25, 2020 12:17
Show Gist options
  • Save kaspermunch/5c40193811675e6c09349181014712b1 to your computer and use it in GitHub Desktop.
Save kaspermunch/5c40193811675e6c09349181014712b1 to your computer and use it in GitHub Desktop.
def optimize_dataframe(df):
converted_df = pandas.DataFrame()
floats_optim = (df
.select_dtypes(include=['float'])
.apply(pandas.to_numeric,downcast='float')
)
converted_df[floats_optim.columns] = floats_optim
ints_optim = (df
.select_dtypes(include=['int'])
.apply(pandas.to_numeric,downcast='integer')
)
converted_df[ints_optim.columns] = ints_optim
for col in df.select_dtypes(include=['object']).columns:
num_unique_values = len(df[col].unique())
num_total_values = len(df[col])
if num_unique_values / num_total_values < 0.5:
converted_df[col] = df[col].astype('category')
else:
converted_df[col] = df[col]
unchanged_cols = df.columns[~df.columns.isin(converted_df.columns)]
converted_df[unchanged_cols] = df[unchanged_cols]
# keep columns order
converted_df = converted_df[df.columns]
return converted_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment