Skip to content

Instantly share code, notes, and snippets.

@diamonaj
Last active January 20, 2024 21:12
Show Gist options
  • Save diamonaj/d01328f2aa6377fbb2838c8d9d51f786 to your computer and use it in GitHub Desktop.
Save diamonaj/d01328f2aa6377fbb2838c8d9d51f786 to your computer and use it in GitHub Desktop.
CS130 LP 130 (Regression)
# NOTE(review): removed `rm(list = ls())` — wiping the caller's global
# environment is a scripting anti-pattern (it deletes the user's workspace
# yet still does not give a clean session; restart R instead).

# Load the training data (x/y pairs) from a published Google Sheet.
training <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSUROPfTOZfUEpf6Ebby-vta5zWCwt9KK-KAwSvpToGQjQSKdhYsUfoHxYxvbOYxW8_IQxBD9FqWFJg/pub?gid=383144413&single=true&output=csv")
head(training)

# Plot the data with big green dots
plot(training$x, training$y, main = "Training Data", pch = 16, cex = 3, col = "green")
################################################
#### RUN 3 DIFFERENT MODELS ON THE TRAINING SET

# Model 1: local regression (loess) with a wide smoothing window (span = 1),
# locally-linear fits (degree = 1).
fit_bigspan <- loess(y ~ x, data = training, span = 1, degree = 1)

# Model 2: loess again, but with a much narrower window (span = 0.2), so the
# curve tracks local structure in the data far more closely.
fit_smallspan <- loess(y ~ x, data = training, span = 0.2, degree = 1)

# Model 3: an ordinary least-squares linear regression.
# Use the `data = training` form (not training$x / training$y) so that
# predict() can be handed new data later on.
reg1 <- lm(y ~ x, data = training)
reg1
##### OBTAIN TRAINING SET RMSEs FOR THE THREE MODELS

# Root-mean-squared error between a prediction vector and the observed
# values. Both arguments must be numeric vectors of the same length.
# (Extracted so the formula is written once, not three times.)
rmse <- function(predicted, observed) {
  sqrt(mean((predicted - observed)^2))
}

# In-sample predictions: calling predict() on a loess fit with no newdata
# returns the fitted values for the training observations themselves.
predicted_ys_training_bigspan <- predict(fit_bigspan)
predicted_ys_training_smallspan <- predict(fit_smallspan)
# Same idea for the linear model (lm), predicting on the training frame.
predicted_ys_training_lm <- predict(reg1, training)

cat("\nbig span, loess training set, RMSE =",
    rmse(predicted_ys_training_bigspan, training$y), "\n")
cat("\nsmall span, loess, training set, RMSE =",
    rmse(predicted_ys_training_smallspan, training$y), "\n")
cat("\nlinear model, training set, RMSE =",
    rmse(predicted_ys_training_lm, training$y), "\n")
############################################
###### OBTAIN RMSEs FOR THE TEST SET RESULTS
###### USING 3 MODELS ABOVE THAT WERE TRAINED ON TRAINING SET
test <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv")

## Remove test-set observations outside the x-range of the training set:
## loess cannot extrapolate, so predict() would return NA for those rows.
removed1 <- which(test$x > max(training$x))
removed2 <- which(test$x < min(training$x))
to_be_removed <- c(removed1, removed2)
# BUG FIX: when no rows are out of range, `to_be_removed` is integer(0) and
# `test[-integer(0), ]` selects ZERO rows — silently emptying the test set.
# Only subset when there is actually something to drop.
if (length(to_be_removed) > 0) {
  test <- test[-to_be_removed, ]
}
head(test)
dim(test)

# Out-of-sample predictions on the (trimmed) test set for all three models.
predicted_ys_test_bigspan <- predict(fit_bigspan, newdata = test)
predicted_ys_test_smallspan <- predict(fit_smallspan, newdata = test)
predicted_ys_test_lm <- predict(reg1, newdata = test)

# Mean squared errors on the held-out test set.
mse_bigspan_test <- mean((test$y - predicted_ys_test_bigspan)^2)
mse_smallspan_test <- mean((test$y - predicted_ys_test_smallspan)^2)
mse_test_linear_regression <- mean((test$y - predicted_ys_test_lm)^2)
##############################
## PRINT SUMMARY OUTPUT...

# Recompute the in-sample RMSEs, named here so the cat() calls below read
# cleanly. Test-set RMSEs come from the MSEs computed above.
rmse_train_bigspan <- sqrt(mean((predicted_ys_training_bigspan - training$y)^2))
rmse_train_smallspan <- sqrt(mean((predicted_ys_training_smallspan - training$y)^2))
rmse_train_lm <- sqrt(mean((predicted_ys_training_lm - training$y)^2))

# Held-out (test set) results first, then the training-set results, so the
# two can be compared side by side.
cat("\nbig span loess, test set, RMSE = ", sqrt(mse_bigspan_test), "\n")
cat("\nsmall span loess, test set, RMSE = ", sqrt(mse_smallspan_test), "\n")
cat("\nlinear model, test set, RMSE =", sqrt(mse_test_linear_regression), "\n")
cat("\nbig span, loess training set, RMSE =", rmse_train_bigspan, "\n")
cat("\nsmall span, loess, training set, RMSE =", rmse_train_smallspan, "\n")
cat("\nlinear model, training set, RMSE =", rmse_train_lm, "\n")
### TRAINING SET RESULTS ###
# big span loess RMSE = 6.2
# small span loess RMSE = 1.8
# linear model RMSE = 7.4
### TEST SET RESULTS ###
# big span loess RMSE = 6.9
# small span loess RMSE = 4.5
# linear model RMSE = 8.7
### CONCLUSIONS:
# (a) test set results validated training set's signal that small-span was best
# - small span generalizes the best to new data
# - even so, small span overfits (compare 1.8 and 4.5)
# - would be cool to know the actual (DGP) irreducible error: is sd(error) < 4.5?
# (b) big span loess & linear model underfit (biased & 'rigid' low var models)
# (c) but big span loess and linear model ALSO overfit: compare 6.2 < 6.9, & 7.4 < 8.7
# (d) so it's possible to both underfit & overfit(!!!) see the two links below:
# - https://stats.stackexchange.com/questions/488434/can-overfitting-and-underfitting-occur-simultaneously
# - https://www.quora.com/Is-it-possible-for-a-Machine-Learning-model-to-simultaneously-overfit-and-underfit-the-training-data
## THE END ##
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment