Skip to content

Instantly share code, notes, and snippets.

@mrecos
Created February 17, 2016 03:02
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Embed
What would you like to do?
Code from my blog post on model overfitting. Code generates simulated data, has an inner loop that fits that data given a range of polynomials in lm(), and nests that in a loop that does it over a bunch of simulated data sets. There are two plots, 1) an animated gif of the fit line for each polynomial on the train and test set and 2) a overall c…
# function to generate samples of data
get_data <- function(n, exp_i, intercept, beta1, sigma, plot = TRUE){
library(lattice)
x <- sample(seq(1,100,0.1), n, replace = TRUE)
gen_model <- intercept + beta1 * x^(exp_i)
y <- rnorm(n, gen_model, sigma)
dat <- data.frame(x, y)
if(isTRUE(plot)){
print(xyplot(y~x, type=c("smooth", "p"),col.line = "darkorange", lwd = 2))
}
return(dat)
}
# prediction error function
RMSE <- function(y, yhat){
sqrt(mean((y-yhat)^2))
}
# libraries
require(ggplot2) # plotting
require(gganimate) # for the animation
require(reshape2) # for melt()
require(dplyr) # for some data munging
# fixed parameters
n <- 100 # number of samples
exp_i <- (-0.3) # model exponent
x_sim <- seq(1,100,1) # range of x variable
poly_range <- seq(1,20,1) # range of polynomials to fit
# number of times to rerun process for aggregate plot
repeats <- 100
#random parameters
intercept <- rnorm(1,50,1) # model intercept
beta1 <- rnorm(1,30,1) # model beta
sigma <- rnorm(1,2.5,0.1) # sigma
plot_dat <- NULL
# outer loop for re-running simulation for second plot
for(z in 1:repeats){
print(z)
# grab samples for training and testing
dat_train <- get_data(n, exp_i, intercept, beta1, e, plot = FALSE)
dat_test <- get_data(500, exp_i, intercept, beta1, e, plot = FALSE)
train_result <- NULL
train_err <- NULL
test_result <- NULL
test_err <- NULL
# inner loop that runs the model for each of the polynomial fits
for(i in seq_along(poly_range)){
# training set model fit
mod <- lm(y ~ poly(x,poly_range[i], raw=TRUE), data = dat_train)
# training set prediction
pred_trn <- predict(mod)
# calc training error
trn_err <- round(RMSE(dat_train$y, pred_trn),3)
# build out the results
train_err <- c(train_err, trn_err)
pred_train <- data.frame(yhat = pred_trn, dat_train, RMSE = trn_err,
poly = poly_range[i], dataset = "Training")
train_result <- rbind(train_result, pred_train)
# testing set prediction using model above
pred_tst <- predict(mod, newdata = dat_test)
# calc testing error
tst_err <- round(RMSE(dat_test$y, pred_tst),3)
# build out testing results
test_err <- c(test_err, tst_err)
pred_test <- data.frame(yhat = pred_tst, dat_test, RMSE = tst_err,
poly = poly_range[i], dataset = "Testing")
test_result <- rbind(test_result, pred_test)
}
# aggregate results from loop above
err_plot_dat <- data.frame(Training = train_err, z = z,
Testing = test_err, Degree = poly_range)
plot_dat <- rbind(plot_dat, err_plot_dat)
}
### This is the plot for the animated results
### Each frame is a fit for a different polynomial
### You need to install ImageMagik for the gif output to work
### BEWARE - the geom_text REALLY slows things down!
tt_plot_dat <- rbind(train_result, test_result)
p <- ggplot(data = tt_plot_dat) +
geom_point(aes(x = x, y = y), color = "gray30", alpha = 0.7) +
geom_line(aes(x = x, y = yhat, frame = as.factor(poly)), color = "red1", size = 0.75) +
geom_text(aes(x = 80, y = 85, frame = as.factor(poly),
label = paste0("RMSE = ", RMSE))) +
theme_bw() +
facet_grid(dataset ~ .) +
ggtitle("Training and Test RMSE by Polynomial Degree = ") +
xlab("X Variable") +
ylab("Y Value") +
scale_x_continuous(breaks=seq(0, 100, 10)) +
scale_y_continuous(breaks=seq(50, 95, 10), limits =c(50,95)) +
theme(
legend.position = "none",
strip.background = element_rect(colour = "gray30", fill = "white",size = 0),
strip.text.y = element_text(face = "bold"),
panel.grid = element_line(colour = "gray70")
)
# print the composite graphic to check it out
print(p)
# this saves it out to the filename
gg_animate(p, "trainv test.gif", interval = 0.5)
### Plot for aggregate train and test errors from all z outer loops
err_plot_dat <- melt(plot_dat, id.vars = c("Degree", "z"))
err_mean <- group_by(err_plot_dat, variable, Degree) %>%
summarise(err_mean = mean(value))
ggplot(data = err_plot_dat, aes(x = Degree, y = value,
group=interaction(z, variable)
, color = variable)) +
geom_line(size = 1, alpha = 0.05) +
geom_line(data = err_mean, aes(x = Degree, y = err_mean,color = variable),
size = 1.5) +
theme_bw() +
scale_color_manual(values = c("firebrick", "black")) +
scale_x_continuous(breaks=seq(0, 20, 1)) +
scale_y_continuous(breaks=seq(2, 5, 0.5), limits =c(2,5)) +
xlab("Polynomial Degrees (Model Complexity)") +
ylab("RMSE") +
ggtitle("Test Versus Training Error over 100 Repeated Samples") +
theme(
panel.grid = element_blank()
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment