Skip to content

Instantly share code, notes, and snippets.

@diamonaj
Last active January 20, 2024 21:12
Show Gist options
  • Save diamonaj/d01328f2aa6377fbb2838c8d9d51f786 to your computer and use it in GitHub Desktop.
Save diamonaj/d01328f2aa6377fbb2838c8d9d51f786 to your computer and use it in GitHub Desktop.
CS130 LP 130 (Regression)
# NOTE(review): removed `rm(list = ls())` — wiping the caller's global
# environment is a scripting anti-pattern (it deletes the user's workspace
# yet still does not give a clean session; restart R instead).

# Load the training data (x/y pairs) from a published Google Sheet.
training <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSUROPfTOZfUEpf6Ebby-vta5zWCwt9KK-KAwSvpToGQjQSKdhYsUfoHxYxvbOYxW8_IQxBD9FqWFJg/pub?gid=383144413&single=true&output=csv")
head(training)

# Plot the data with big green dots
plot(training$x, training$y, main = "Training Data", pch = 16, cex = 3, col = "green")
################################################
#### RUN 3 DIFFERENT MODELS ON THE TRAINING SET

# Model 1: local regression (loess) with a wide smoothing window (span = 1),
# locally-linear fits (degree = 1).
fit_bigspan <- loess(y ~ x, data = training, span = 1, degree = 1)

# Model 2: loess again, but with a much narrower window (span = 0.2), so the
# curve tracks local structure in the data far more closely.
fit_smallspan <- loess(y ~ x, data = training, span = 0.2, degree = 1)

# Model 3: an ordinary least-squares linear regression.
# Use the `data = training` form (not training$x / training$y) so that
# predict() can be handed new data later on.
reg1 <- lm(y ~ x, data = training)
reg1
##### OBTAIN TRAINING SET RMSEs FOR THE THREE MODELS

# Root-mean-squared error between a prediction vector and the observed
# values. Both arguments must be numeric vectors of the same length.
# (Extracted so the formula is written once, not three times.)
rmse <- function(predicted, observed) {
  sqrt(mean((predicted - observed)^2))
}

# In-sample predictions: calling predict() on a loess fit with no newdata
# returns the fitted values for the training observations themselves.
predicted_ys_training_bigspan <- predict(fit_bigspan)
predicted_ys_training_smallspan <- predict(fit_smallspan)
# Same idea for the linear model (lm), predicting on the training frame.
predicted_ys_training_lm <- predict(reg1, training)

cat("\nbig span, loess training set, RMSE =",
    rmse(predicted_ys_training_bigspan, training$y), "\n")
cat("\nsmall span, loess, training set, RMSE =",
    rmse(predicted_ys_training_smallspan, training$y), "\n")
cat("\nlinear model, training set, RMSE =",
    rmse(predicted_ys_training_lm, training$y), "\n")
############################################
###### OBTAIN RMSEs FOR THE TEST SET RESULTS
###### USING 3 MODELS ABOVE THAT WERE TRAINED ON TRAINING SET
test <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv")

## Remove test-set observations outside the x-range of the training set:
## loess cannot extrapolate, so predict() would return NA for those rows.
removed1 <- which(test$x > max(training$x))
removed2 <- which(test$x < min(training$x))
to_be_removed <- c(removed1, removed2)
# BUG FIX: when no rows are out of range, `to_be_removed` is integer(0) and
# `test[-integer(0), ]` selects ZERO rows — silently emptying the test set.
# Only subset when there is actually something to drop.
if (length(to_be_removed) > 0) {
  test <- test[-to_be_removed, ]
}
head(test)
dim(test)

# Out-of-sample predictions on the (trimmed) test set for all three models.
predicted_ys_test_bigspan <- predict(fit_bigspan, newdata = test)
predicted_ys_test_smallspan <- predict(fit_smallspan, newdata = test)
predicted_ys_test_lm <- predict(reg1, newdata = test)

# Mean squared errors on the held-out test set.
mse_bigspan_test <- mean((test$y - predicted_ys_test_bigspan)^2)
mse_smallspan_test <- mean((test$y - predicted_ys_test_smallspan)^2)
mse_test_linear_regression <- mean((test$y - predicted_ys_test_lm)^2)
##############################
## PRINT SUMMARY OUTPUT...

# Recompute the in-sample RMSEs, named here so the cat() calls below read
# cleanly. Test-set RMSEs come from the MSEs computed above.
rmse_train_bigspan <- sqrt(mean((predicted_ys_training_bigspan - training$y)^2))
rmse_train_smallspan <- sqrt(mean((predicted_ys_training_smallspan - training$y)^2))
rmse_train_lm <- sqrt(mean((predicted_ys_training_lm - training$y)^2))

# Held-out (test set) results first, then the training-set results, so the
# two can be compared side by side.
cat("\nbig span loess, test set, RMSE = ", sqrt(mse_bigspan_test), "\n")
cat("\nsmall span loess, test set, RMSE = ", sqrt(mse_smallspan_test), "\n")
cat("\nlinear model, test set, RMSE =", sqrt(mse_test_linear_regression), "\n")
cat("\nbig span, loess training set, RMSE =", rmse_train_bigspan, "\n")
cat("\nsmall span, loess, training set, RMSE =", rmse_train_smallspan, "\n")
cat("\nlinear model, training set, RMSE =", rmse_train_lm, "\n")
### TRAINING SET RESULTS ###
# big span loess RMSE = 6.2
# small span loess RMSE = 1.8
# linear model RMSE = 7.4
### TEST SET RESULTS ###
# big span loess RMSE = 6.9
# small span loess RMSE = 4.5
# linear model RMSE = 8.7
### CONCLUSIONS:
# (a) test set results validated training set's signal that small-span was best
# - small span generalizes the best to new data
# - even so, small span overfits (compare 1.8 and 4.5)
# - would be cool to know the actual (DGP) irreducible error: is sd(error) < 4.5?
# (b) big span loess & linear model underfit (biased & 'rigid' low var models)
# (c) but big span loess and linear model ALSO overfit: compare 6.2 < 6.9, & 7.4 < 8.7
# (d) so it's possible to both underfit & overfit(!!!) see the two links below:
# - https://stats.stackexchange.com/questions/488434/can-overfitting-and-underfitting-occur-simultaneously
# - https://www.quora.com/Is-it-possible-for-a-Machine-Learning-model-to-simultaneously-overfit-and-underfit-the-training-data
## THE END ##
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment