# fclesio/difference-between-rmse-rmsle.py

## difference-between-rmse-rmsle.py
import pandas as pd
import math
import numpy as np

# Start from an empty frame with one column of values (y) and one of estimates (y_hat)
df_preds = pd.DataFrame(columns=['y', 'y_hat'])

# Append the sample rows one by one; each row is stored under the next integer label
for _pair in ([1, 1], [2, 3], [50, 55], [500, 502], [1000000, 1000005]):
    df_preds.loc[len(df_preds)] = _pair

# Display the frame (only has a visible effect in an interactive session)
df_preds

#          y    y_hat
# 0        1        1
# 1        2        3
# 2       50       55
# 3      500      502
# 4  1000000  1000005

# Create functions
def rmse(predictions, targets):
    '''Return the root mean squared error between predictions and targets.

    Source: https://stackoverflow.com/questions/17197492/is-there-a-library-function-for-root-mean-square-error-rmse-in-python

    Both arguments must support elementwise subtraction and the result must
    offer ``** 2`` and ``.mean()`` (e.g. NumPy arrays).
    '''
    residuals = predictions - targets
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)

def rmsle(predict, target):
    '''Return the root mean squared logarithmic error of predict against target.

    Source: https://towardsdatascience.com/metrics-and-python-850b60710e0c

    Every value is transformed with log1p(x) = log(1 + x) before the
    difference is taken. The previous version of this function called
    log1p(x + 1), which evaluates log(x + 2); any output recorded with that
    version (for example in comments in this file) differs from what this
    function returns.

    Pairs in which either transformed value is NaN contribute nothing to the
    sum of squares but are still counted in the denominator, len(predict).

    Args:
        predict: sequence or array of predicted values.
        target: sequence or array of actual values, same length as predict.

    Returns:
        The RMSLE as a NumPy float.

    Raises:
        ZeroDivisionError: if predict is empty.
    '''
    predicted = np.asarray(predict, dtype=float)
    actual = np.asarray(target, dtype=float)

    # log1p already adds the 1; adding another one would compute log(x + 2).
    log_predicted = np.log1p(predicted)
    log_actual = np.log1p(actual)

    # Drop pairs whose log transform is undefined, as the loop version did.
    usable = ~(np.isnan(log_predicted) | np.isnan(log_actual))
    squared_diffs = (log_predicted[usable] - log_actual[usable]) ** 2

    # Plain Python division keeps the ZeroDivisionError on empty input.
    mean_squared = float(squared_diffs.sum()) / len(predicted)
    return np.sqrt(mean_squared)


# Check the data before execution
df_preds

#          y    y_hat
# 0        1        1
# 1        2        3
# 2       50       55
# 3      500      502
# 4  1000000  1000005

# Report both metrics, passing y_hat as the predictions and y as the targets
print(f"RMSE: {rmse(df_preds['y_hat'].values, df_preds['y'].values)}")
print(f"RMSLE: {rmsle(df_preds['y_hat'].values, df_preds['y'].values)}")
# Output recorded by the author:
# RMSE: 3.3166247903554
# RMSLE: 0.1079235658917167


# Second scenario: raise the error on the largest-magnitude value from 5 to 500

# Rebuild the frame from scratch
df_preds = pd.DataFrame(columns=['y', 'y_hat'])

# Same rows as before except the last one, whose estimate is now off by 500
for _pair in ([1, 1], [2, 3], [50, 55], [500, 502], [1000000, 1000500]):
    df_preds.loc[len(df_preds)] = _pair

# Display the frame (only has a visible effect in an interactive session)
df_preds

#          y    y_hat
# 0        1        1
# 1        2        3
# 2       50       55
# 3      500      502
# 4  1000000  1000500

# In the recorded output below the RMSE explodes while the RMSLE barely moves:
# the same absolute error on a large-magnitude value is a tiny difference on
# the log scale.
print(f"RMSE: {rmse(df_preds['y_hat'].values, df_preds['y'].values)}")
print(f"RMSLE: {rmsle(df_preds['y_hat'].values, df_preds['y'].values)}")
# Output recorded by the author:
# RMSE: 223.6202137553759
# RMSLE: 0.10792379739703087
	import pandas as pd
	import math
	import numpy as np

	# Start from an empty frame with one column of values (y) and one of estimates (y_hat)
	df_preds = pd.DataFrame(columns=['y', 'y_hat'])

	# Append the sample rows one by one; each row is stored under the next integer label
	for _pair in ([1, 1], [2, 3], [50, 55], [500, 502], [1000000, 1000005]):
	    df_preds.loc[len(df_preds)] = _pair

	# Display the frame (only has a visible effect in an interactive session)
	df_preds

	#          y    y_hat
	# 0        1        1
	# 1        2        3
	# 2       50       55
	# 3      500      502
	# 4  1000000  1000005

	# Create functions
	def rmse(predictions, targets):
	    '''Return the root mean squared error between predictions and targets.

	    Source: https://stackoverflow.com/questions/17197492/is-there-a-library-function-for-root-mean-square-error-rmse-in-python

	    Both arguments must support elementwise subtraction and the result must
	    offer ``** 2`` and ``.mean()`` (e.g. NumPy arrays).
	    '''
	    # The body is indented under the def; the pasted version had lost this
	    # indentation and could not be parsed.
	    return np.sqrt(((predictions - targets) ** 2).mean())

	def rmsle(predict, target):
	    '''Return the root mean squared logarithmic error of predict against target.

	    Source: https://towardsdatascience.com/metrics-and-python-850b60710e0c

	    Every value is transformed with log1p(x) = log(1 + x) before the
	    difference is taken. The previous version of this function called
	    log1p(x + 1), which evaluates log(x + 2); any output recorded with that
	    version (for example in comments in this file) differs from what this
	    function returns.

	    Pairs in which either transformed value is NaN contribute nothing to the
	    sum of squares but are still counted in the denominator, len(predict).

	    Args:
	        predict: sequence or array of predicted values.
	        target: sequence or array of actual values, same length as predict.

	    Returns:
	        The RMSLE as a NumPy float.

	    Raises:
	        ZeroDivisionError: if predict is empty.
	    '''
	    predicted = np.asarray(predict, dtype=float)
	    actual = np.asarray(target, dtype=float)

	    # log1p already adds the 1; adding another one would compute log(x + 2).
	    log_predicted = np.log1p(predicted)
	    log_actual = np.log1p(actual)

	    # Drop pairs whose log transform is undefined, as the loop version did.
	    usable = ~(np.isnan(log_predicted) | np.isnan(log_actual))
	    squared_diffs = (log_predicted[usable] - log_actual[usable]) ** 2

	    # Plain Python division keeps the ZeroDivisionError on empty input.
	    mean_squared = float(squared_diffs.sum()) / len(predicted)
	    return np.sqrt(mean_squared)


	# Check the data before execution
	df_preds

	#          y    y_hat
	# 0        1        1
	# 1        2        3
	# 2       50       55
	# 3      500      502
	# 4  1000000  1000005

	# Report both metrics, passing y_hat as the predictions and y as the targets
	print(f"RMSE: {rmse(df_preds['y_hat'].values, df_preds['y'].values)}")
	print(f"RMSLE: {rmsle(df_preds['y_hat'].values, df_preds['y'].values)}")
	# Output recorded by the author:
	# RMSE: 3.3166247903554
	# RMSLE: 0.1079235658917167


	# Second scenario: raise the error on the largest-magnitude value from 5 to 500

	# Rebuild the frame from scratch
	df_preds = pd.DataFrame(columns=['y', 'y_hat'])

	# Same rows as before except the last one, whose estimate is now off by 500
	for _pair in ([1, 1], [2, 3], [50, 55], [500, 502], [1000000, 1000500]):
	    df_preds.loc[len(df_preds)] = _pair

	# Display the frame (only has a visible effect in an interactive session)
	df_preds

	#          y    y_hat
	# 0        1        1
	# 1        2        3
	# 2       50       55
	# 3      500      502
	# 4  1000000  1000500

	# In the recorded output below the RMSE explodes while the RMSLE barely moves:
	# the same absolute error on a large-magnitude value is a tiny difference on
	# the log scale.
	print(f"RMSE: {rmse(df_preds['y_hat'].values, df_preds['y'].values)}")
	print(f"RMSLE: {rmsle(df_preds['y_hat'].values, df_preds['y'].values)}")
	# Output recorded by the author:
	# RMSE: 223.6202137553759
	# RMSLE: 0.10792379739703087