Instantly share code, notes, and snippets.

Embed
What would you like to do?
Code from my blog post on model overfitting. Code generates simulated data, has an inner loop that fits that data given a range of polynomials in lm(), and nests that in a loop that does it over a bunch of simulated data sets. There are two plots, 1) an animated gif of the fit line for each polynomial on the train and test set and 2) a overall c…
# function to generate samples of data
get_data <- function(n, exp_i, intercept, beta1, sigma, plot = TRUE){
library(lattice)
x <- sample(seq(1,100,0.1), n, replace = TRUE)
gen_model <- intercept + beta1 * x^(exp_i)
y <- rnorm(n, gen_model, sigma)
dat <- data.frame(x, y)
if(isTRUE(plot)){
print(xyplot(y~x, type=c("smooth", "p"),col.line = "darkorange", lwd = 2))
}
return(dat)
}
# prediction error function
RMSE <- function(y, yhat){
sqrt(mean((y-yhat)^2))
}
# libraries
require(ggplot2) # plotting
require(gganimate) # for the animation
require(reshape2) # for melt()
require(dplyr) # for some data munging
# fixed parameters
n <- 100 # number of samples
exp_i <- (-0.3) # model exponent
x_sim <- seq(1,100,1) # range of x variable
poly_range <- seq(1,20,1) # range of polynomials to fit
# number of times to rerun process for aggregate plot
repeats <- 100
#random parameters
intercept <- rnorm(1,50,1) # model intercept
beta1 <- rnorm(1,30,1) # model beta
sigma <- rnorm(1,2.5,0.1) # sigma
plot_dat <- NULL
# outer loop for re-running simulation for second plot
for(z in 1:repeats){
print(z)
# grab samples for training and testing
dat_train <- get_data(n, exp_i, intercept, beta1, e, plot = FALSE)
dat_test <- get_data(500, exp_i, intercept, beta1, e, plot = FALSE)
train_result <- NULL
train_err <- NULL
test_result <- NULL
test_err <- NULL
# inner loop that runs the model for each of the polynomial fits
for(i in seq_along(poly_range)){
# training set model fit
mod <- lm(y ~ poly(x,poly_range[i], raw=TRUE), data = dat_train)
# training set prediction
pred_trn <- predict(mod)
# calc training error
trn_err <- round(RMSE(dat_train$y, pred_trn),3)
# build out the results
train_err <- c(train_err, trn_err)
pred_train <- data.frame(yhat = pred_trn, dat_train, RMSE = trn_err,
poly = poly_range[i], dataset = "Training")
train_result <- rbind(train_result, pred_train)
# testing set prediction using model above
pred_tst <- predict(mod, newdata = dat_test)
# calc testing error
tst_err <- round(RMSE(dat_test$y, pred_tst),3)
# build out testing results
test_err <- c(test_err, tst_err)
pred_test <- data.frame(yhat = pred_tst, dat_test, RMSE = tst_err,
poly = poly_range[i], dataset = "Testing")
test_result <- rbind(test_result, pred_test)
}
# aggregate results from loop above
err_plot_dat <- data.frame(Training = train_err, z = z,
Testing = test_err, Degree = poly_range)
plot_dat <- rbind(plot_dat, err_plot_dat)
}
### This is the plot for the animated results
### Each frame is a fit for a different polynomial
### You need to install ImageMagik for the gif output to work
### BEWARE - the geom_text REALLY slows things down!
tt_plot_dat <- rbind(train_result, test_result)
p <- ggplot(data = tt_plot_dat) +
geom_point(aes(x = x, y = y), color = "gray30", alpha = 0.7) +
geom_line(aes(x = x, y = yhat, frame = as.factor(poly)), color = "red1", size = 0.75) +
geom_text(aes(x = 80, y = 85, frame = as.factor(poly),
label = paste0("RMSE = ", RMSE))) +
theme_bw() +
facet_grid(dataset ~ .) +
ggtitle("Training and Test RMSE by Polynomial Degree = ") +
xlab("X Variable") +
ylab("Y Value") +
scale_x_continuous(breaks=seq(0, 100, 10)) +
scale_y_continuous(breaks=seq(50, 95, 10), limits =c(50,95)) +
theme(
legend.position = "none",
strip.background = element_rect(colour = "gray30", fill = "white",size = 0),
strip.text.y = element_text(face = "bold"),
panel.grid = element_line(colour = "gray70")
)
# print the composite graphic to check it out
print(p)
# this saves it out to the filename
gg_animate(p, "trainv test.gif", interval = 0.5)
### Plot for aggregate train and test errors from all z outer loops
err_plot_dat <- melt(plot_dat, id.vars = c("Degree", "z"))
err_mean <- group_by(err_plot_dat, variable, Degree) %>%
summarise(err_mean = mean(value))
ggplot(data = err_plot_dat, aes(x = Degree, y = value,
group=interaction(z, variable)
, color = variable)) +
geom_line(size = 1, alpha = 0.05) +
geom_line(data = err_mean, aes(x = Degree, y = err_mean,color = variable),
size = 1.5) +
theme_bw() +
scale_color_manual(values = c("firebrick", "black")) +
scale_x_continuous(breaks=seq(0, 20, 1)) +
scale_y_continuous(breaks=seq(2, 5, 0.5), limits =c(2,5)) +
xlab("Polynomial Degrees (Model Complexity)") +
ylab("RMSE") +
ggtitle("Test Versus Training Error over 100 Repeated Samples") +
theme(
panel.grid = element_blank()
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment