Created
March 22, 2020 14:28
-
-
Save fclesio/1fc8ae544b55059a1eee9557901daee3 to your computer and use it in GitHub Desktop.
Sensitivity difference between RMSE and RMSLE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import math | |
import numpy as np | |
# Create the example dataframe: one (y, y_hat) pair per row.
# The rows are passed to the constructor in one go instead of growing an
# empty frame with ``.loc[len(df)]``, so the column dtypes are inferred from
# the data itself.
df_preds = pd.DataFrame(
    [
        [1, 1],
        [2, 3],
        [50, 55],
        [500, 502],
        [1000000, 1000005],
    ],
    columns=['y', 'y_hat'],
)

# Check (a bare expression only displays something in a notebook/REPL)
df_preds
#          y    y_hat
# 0        1        1
# 1        2        3
# 2       50       55
# 3      500      502
# 4  1000000  1000005
# Create functions | |
def rmse(predictions, targets):
    """Return the root mean squared error between predictions and targets.

    Both arguments must support element-wise subtraction and ``** 2`` and
    expose a ``.mean()`` method (e.g. NumPy arrays).

    Source: https://stackoverflow.com/questions/17197492/is-there-a-library-function-for-root-mean-square-error-rmse-in-python
    """
    residuals = predictions - targets
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)
def rmsle(predict, target):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(sum((log1p(predict) - log1p(target)) ** 2) / len(predict))``.

    ``np.log1p(x)`` already evaluates ``log(1 + x)``, so the values are passed
    to it unchanged. Adding a further ``+ 1`` would turn the metric into a
    comparison of ``log(x + 2)`` terms, which is not RMSLE and distorts the
    result most for small values.

    Args:
        predict: Array-like of predicted values.
        target: Array-like of true values, same length as ``predict``.

    Returns:
        The RMSLE as a NumPy float.

    Raises:
        ZeroDivisionError: If ``predict`` is empty.

    Adapted from: https://towardsdatascience.com/metrics-and-python-850b60710e0c
    """
    log_pred = np.log1p(np.asarray(predict, dtype=float))
    log_targ = np.log1p(np.asarray(target, dtype=float))

    # Pairs where either logarithm is NaN contribute nothing to the sum, but
    # they still count in the denominator (the mean is over len(predict)).
    usable = ~(np.isnan(log_pred) | np.isnan(log_targ))
    total = float(np.sum((log_pred[usable] - log_targ[usable]) ** 2))

    # ``total`` is a plain float so an empty input raises ZeroDivisionError
    # instead of silently producing NaN.
    return np.sqrt(total / len(predict))
# Check data before execution
df_preds
#          y    y_hat
# 0        1        1
# 1        2        3
# 2       50       55
# 3      500      502
# 4  1000000  1000005

# Get stats: both metrics take (predictions, targets) as raw NumPy arrays
print(f"RMSE: {rmse(df_preds['y_hat'].values, df_preds['y'].values)!s}")
print(f"RMSLE: {rmsle(df_preds['y_hat'].values, df_preds['y'].values)!s}")
# RMSE: 3.3166247903554
# RMSLE: 0.1079235658917167
# Increase the error on the largest-magnitude value (from 5 to 500).
# Create the dataframe: one (y, y_hat) pair per row. The rows are passed to
# the constructor in one go instead of growing an empty frame with
# ``.loc[len(df)]``, so the column dtypes are inferred from the data itself.
df_preds = pd.DataFrame(
    [
        [1, 1],
        [2, 3],
        [50, 55],
        [500, 502],
        [1000000, 1000500],
    ],
    columns=['y', 'y_hat'],
)

# Check (a bare expression only displays something in a notebook/REPL)
df_preds
#          y    y_hat
# 0        1        1
# 1        2        3
# 2       50       55
# 3      500      502
# 4  1000000  1000500
# The RMSE exploded, while the RMSLE barely moved: it works on the log scale,
# so a 500-unit miss on a value of 1,000,000 is hardly penalized.
print(f"RMSE: {rmse(df_preds['y_hat'].values, df_preds['y'].values)!s}")
print(f"RMSLE: {rmsle(df_preds['y_hat'].values, df_preds['y'].values)!s}")
# RMSE: 223.6202137553759
# RMSLE: 0.10792379739703087
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment