Code from my blog post on model overfitting. Code generates simulated data, has an inner loop that fits that data given a range of polynomials in lm(), and nests that in a loop that does it over a bunch of simulated data sets. There are two plots, 1) an animated gif of the fit line for each polynomial on the train and test set and 2) a overall c…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function to generate samples of data | |
get_data <- function(n, exp_i, intercept, beta1, sigma, plot = TRUE){ | |
library(lattice) | |
x <- sample(seq(1,100,0.1), n, replace = TRUE) | |
gen_model <- intercept + beta1 * x^(exp_i) | |
y <- rnorm(n, gen_model, sigma) | |
dat <- data.frame(x, y) | |
if(isTRUE(plot)){ | |
print(xyplot(y~x, type=c("smooth", "p"),col.line = "darkorange", lwd = 2)) | |
} | |
return(dat) | |
} | |
# prediction error function | |
RMSE <- function(y, yhat){ | |
sqrt(mean((y-yhat)^2)) | |
} | |
# libraries | |
require(ggplot2) # plotting | |
require(gganimate) # for the animation | |
require(reshape2) # for melt() | |
require(dplyr) # for some data munging | |
# fixed parameters | |
n <- 100 # number of samples | |
exp_i <- (-0.3) # model exponent | |
x_sim <- seq(1,100,1) # range of x variable | |
poly_range <- seq(1,20,1) # range of polynomials to fit | |
# number of times to rerun process for aggregate plot | |
repeats <- 100 | |
#random parameters | |
intercept <- rnorm(1,50,1) # model intercept | |
beta1 <- rnorm(1,30,1) # model beta | |
sigma <- rnorm(1,2.5,0.1) # sigma | |
plot_dat <- NULL | |
# outer loop for re-running simulation for second plot | |
for(z in 1:repeats){ | |
print(z) | |
# grab samples for training and testing | |
dat_train <- get_data(n, exp_i, intercept, beta1, e, plot = FALSE) | |
dat_test <- get_data(500, exp_i, intercept, beta1, e, plot = FALSE) | |
train_result <- NULL | |
train_err <- NULL | |
test_result <- NULL | |
test_err <- NULL | |
# inner loop that runs the model for each of the polynomial fits | |
for(i in seq_along(poly_range)){ | |
# training set model fit | |
mod <- lm(y ~ poly(x,poly_range[i], raw=TRUE), data = dat_train) | |
# training set prediction | |
pred_trn <- predict(mod) | |
# calc training error | |
trn_err <- round(RMSE(dat_train$y, pred_trn),3) | |
# build out the results | |
train_err <- c(train_err, trn_err) | |
pred_train <- data.frame(yhat = pred_trn, dat_train, RMSE = trn_err, | |
poly = poly_range[i], dataset = "Training") | |
train_result <- rbind(train_result, pred_train) | |
# testing set prediction using model above | |
pred_tst <- predict(mod, newdata = dat_test) | |
# calc testing error | |
tst_err <- round(RMSE(dat_test$y, pred_tst),3) | |
# build out testing results | |
test_err <- c(test_err, tst_err) | |
pred_test <- data.frame(yhat = pred_tst, dat_test, RMSE = tst_err, | |
poly = poly_range[i], dataset = "Testing") | |
test_result <- rbind(test_result, pred_test) | |
} | |
# aggregate results from loop above | |
err_plot_dat <- data.frame(Training = train_err, z = z, | |
Testing = test_err, Degree = poly_range) | |
plot_dat <- rbind(plot_dat, err_plot_dat) | |
} | |
### This is the plot for the animated results | |
### Each frame is a fit for a different polynomial | |
### You need to install ImageMagik for the gif output to work | |
### BEWARE - the geom_text REALLY slows things down! | |
tt_plot_dat <- rbind(train_result, test_result) | |
p <- ggplot(data = tt_plot_dat) + | |
geom_point(aes(x = x, y = y), color = "gray30", alpha = 0.7) + | |
geom_line(aes(x = x, y = yhat, frame = as.factor(poly)), color = "red1", size = 0.75) + | |
geom_text(aes(x = 80, y = 85, frame = as.factor(poly), | |
label = paste0("RMSE = ", RMSE))) + | |
theme_bw() + | |
facet_grid(dataset ~ .) + | |
ggtitle("Training and Test RMSE by Polynomial Degree = ") + | |
xlab("X Variable") + | |
ylab("Y Value") + | |
scale_x_continuous(breaks=seq(0, 100, 10)) + | |
scale_y_continuous(breaks=seq(50, 95, 10), limits =c(50,95)) + | |
theme( | |
legend.position = "none", | |
strip.background = element_rect(colour = "gray30", fill = "white",size = 0), | |
strip.text.y = element_text(face = "bold"), | |
panel.grid = element_line(colour = "gray70") | |
) | |
# print the composite graphic to check it out | |
print(p) | |
# this saves it out to the filename | |
gg_animate(p, "trainv test.gif", interval = 0.5) | |
### Plot for aggregate train and test errors from all z outer loops | |
err_plot_dat <- melt(plot_dat, id.vars = c("Degree", "z")) | |
err_mean <- group_by(err_plot_dat, variable, Degree) %>% | |
summarise(err_mean = mean(value)) | |
ggplot(data = err_plot_dat, aes(x = Degree, y = value, | |
group=interaction(z, variable) | |
, color = variable)) + | |
geom_line(size = 1, alpha = 0.05) + | |
geom_line(data = err_mean, aes(x = Degree, y = err_mean,color = variable), | |
size = 1.5) + | |
theme_bw() + | |
scale_color_manual(values = c("firebrick", "black")) + | |
scale_x_continuous(breaks=seq(0, 20, 1)) + | |
scale_y_continuous(breaks=seq(2, 5, 0.5), limits =c(2,5)) + | |
xlab("Polynomial Degrees (Model Complexity)") + | |
ylab("RMSE") + | |
ggtitle("Test Versus Training Error over 100 Repeated Samples") + | |
theme( | |
panel.grid = element_blank() | |
) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment