mrecos/Test_vs_Train_error.R

## Test_vs_Train_error.R
# function to generate samples of data
get_data <- function(n, exp_i, intercept, beta1, sigma, plot = TRUE){
  library(lattice)
  x <- sample(seq(1,100,0.1), n, replace = TRUE)
  gen_model <- intercept + beta1 * x^(exp_i)
  y <- rnorm(n, gen_model, sigma)
  dat <- data.frame(x, y)
  if(isTRUE(plot)){
    print(xyplot(y~x, type=c("smooth", "p"),col.line = "darkorange", lwd = 2))
  }
  return(dat)
}

# prediction error function
RMSE <- function(y, yhat){
  sqrt(mean((y-yhat)^2))
}

# libraries
require(ggplot2) # plotting
require(gganimate) # for the animation
require(reshape2) # for melt()
require(dplyr) # for some data munging

# fixed parameters
n <- 100 # number of samples
exp_i <- (-0.3) # model exponent
x_sim <- seq(1,100,1) # range of x variable
poly_range <- seq(1,20,1) # range of polynomials to fit
# number of times to rerun process for aggregate plot
repeats <- 100

#random parameters
intercept <- rnorm(1,50,1) # model intercept
beta1 <- rnorm(1,30,1) # model beta
sigma <- rnorm(1,2.5,0.1) # sigma

plot_dat <- NULL
# outer loop for re-running simulation for second plot
for(z in 1:repeats){
  print(z)
  # grab samples for training and testing
  dat_train <- get_data(n, exp_i, intercept, beta1, e, plot = FALSE)
  dat_test <- get_data(500, exp_i, intercept, beta1, e, plot = FALSE)

  train_result <- NULL
  train_err <- NULL
  test_result <- NULL
  test_err <- NULL
  # inner loop that runs the model for each of the polynomial fits
  for(i in seq_along(poly_range)){
    # training set model fit
    mod <- lm(y ~ poly(x,poly_range[i], raw=TRUE), data = dat_train)
    # training set prediction
    pred_trn <- predict(mod)
    # calc training error
    trn_err <- round(RMSE(dat_train$y, pred_trn),3)
    # build out the results
    train_err <- c(train_err, trn_err)
    pred_train <- data.frame(yhat = pred_trn, dat_train, RMSE = trn_err,
                             poly = poly_range[i], dataset = "Training")
    train_result <- rbind(train_result, pred_train)
    # testing set prediction using model above
    pred_tst <- predict(mod, newdata = dat_test)
    # calc testing error
    tst_err <- round(RMSE(dat_test$y, pred_tst),3)
    # build out testing results
    test_err <- c(test_err, tst_err)
    pred_test <- data.frame(yhat = pred_tst, dat_test, RMSE = tst_err,
                            poly = poly_range[i], dataset = "Testing")
    test_result <- rbind(test_result, pred_test)
  }
  # aggregate results from loop above
  err_plot_dat <- data.frame(Training = train_err, z = z,
                             Testing = test_err, Degree = poly_range)
  plot_dat <- rbind(plot_dat, err_plot_dat)
}

### This is the plot for the animated results
### Each frame is a fit for a different polynomial
### You need to install ImageMagik for the gif output to work
### BEWARE - the geom_text REALLY slows things down!
tt_plot_dat <- rbind(train_result, test_result)
p <- ggplot(data = tt_plot_dat) +
  geom_point(aes(x = x, y = y), color = "gray30", alpha = 0.7) +
  geom_line(aes(x = x, y = yhat, frame = as.factor(poly)), color = "red1", size = 0.75) +
  geom_text(aes(x = 80, y = 85, frame = as.factor(poly),
  label = paste0("RMSE = ", RMSE))) +
  theme_bw() +
  facet_grid(dataset ~ .) +
  ggtitle("Training and Test RMSE by Polynomial Degree = ") +
  xlab("X Variable") +
  ylab("Y Value") +
  scale_x_continuous(breaks=seq(0, 100, 10))  +
  scale_y_continuous(breaks=seq(50, 95, 10), limits =c(50,95)) +
  theme(
    legend.position = "none",
    strip.background = element_rect(colour = "gray30", fill = "white",size = 0),
    strip.text.y = element_text(face = "bold"),
    panel.grid = element_line(colour = "gray70")
  )

# print the composite graphic to check it out
print(p)
# this saves it out to the filename
gg_animate(p, "trainv test.gif", interval = 0.5)

### Plot for aggregate train and test errors from all z outer loops
err_plot_dat <- melt(plot_dat, id.vars = c("Degree", "z"))
err_mean <- group_by(err_plot_dat, variable, Degree) %>%
  summarise(err_mean = mean(value))
ggplot(data = err_plot_dat, aes(x = Degree, y = value,
                         group=interaction(z, variable)
                         , color = variable)) +
  geom_line(size = 1, alpha = 0.05) +
  geom_line(data = err_mean, aes(x = Degree, y = err_mean,color = variable),
            size = 1.5) +
  theme_bw() +
  scale_color_manual(values = c("firebrick", "black")) +
  scale_x_continuous(breaks=seq(0, 20, 1))  +
  scale_y_continuous(breaks=seq(2, 5, 0.5), limits =c(2,5)) +
  xlab("Polynomial Degrees (Model Complexity)") +
  ylab("RMSE") +
  ggtitle("Test Versus Training Error over 100 Repeated Samples") +
  theme(
    panel.grid = element_blank()
  )
	# function to generate samples of data
	get_data <- function(n, exp_i, intercept, beta1, sigma, plot = TRUE){
	library(lattice)
	x <- sample(seq(1,100,0.1), n, replace = TRUE)
	gen_model <- intercept + beta1 * x^(exp_i)
	y <- rnorm(n, gen_model, sigma)
	dat <- data.frame(x, y)
	if(isTRUE(plot)){
	print(xyplot(y~x, type=c("smooth", "p"),col.line = "darkorange", lwd = 2))
	}
	return(dat)
	}

	# prediction error function
	RMSE <- function(y, yhat){
	sqrt(mean((y-yhat)^2))
	}

	# libraries
	require(ggplot2) # plotting
	require(gganimate) # for the animation
	require(reshape2) # for melt()
	require(dplyr) # for some data munging

	# fixed parameters
	n <- 100 # number of samples
	exp_i <- (-0.3) # model exponent
	x_sim <- seq(1,100,1) # range of x variable
	poly_range <- seq(1,20,1) # range of polynomials to fit
	# number of times to rerun process for aggregate plot
	repeats <- 100

	#random parameters
	intercept <- rnorm(1,50,1) # model intercept
	beta1 <- rnorm(1,30,1) # model beta
	sigma <- rnorm(1,2.5,0.1) # sigma

	plot_dat <- NULL
	# outer loop for re-running simulation for second plot
	for(z in 1:repeats){
	print(z)
	# grab samples for training and testing
	dat_train <- get_data(n, exp_i, intercept, beta1, e, plot = FALSE)
	dat_test <- get_data(500, exp_i, intercept, beta1, e, plot = FALSE)

	train_result <- NULL
	train_err <- NULL
	test_result <- NULL
	test_err <- NULL
	# inner loop that runs the model for each of the polynomial fits
	for(i in seq_along(poly_range)){
	# training set model fit
	mod <- lm(y ~ poly(x,poly_range[i], raw=TRUE), data = dat_train)
	# training set prediction
	pred_trn <- predict(mod)
	# calc training error
	trn_err <- round(RMSE(dat_train$y, pred_trn),3)
	# build out the results
	train_err <- c(train_err, trn_err)
	pred_train <- data.frame(yhat = pred_trn, dat_train, RMSE = trn_err,
	poly = poly_range[i], dataset = "Training")
	train_result <- rbind(train_result, pred_train)
	# testing set prediction using model above
	pred_tst <- predict(mod, newdata = dat_test)
	# calc testing error
	tst_err <- round(RMSE(dat_test$y, pred_tst),3)
	# build out testing results
	test_err <- c(test_err, tst_err)
	pred_test <- data.frame(yhat = pred_tst, dat_test, RMSE = tst_err,
	poly = poly_range[i], dataset = "Testing")
	test_result <- rbind(test_result, pred_test)
	}
	# aggregate results from loop above
	err_plot_dat <- data.frame(Training = train_err, z = z,
	Testing = test_err, Degree = poly_range)
	plot_dat <- rbind(plot_dat, err_plot_dat)
	}

	### This is the plot for the animated results
	### Each frame is a fit for a different polynomial
	### You need to install ImageMagik for the gif output to work
	### BEWARE - the geom_text REALLY slows things down!
	tt_plot_dat <- rbind(train_result, test_result)
	p <- ggplot(data = tt_plot_dat) +
	geom_point(aes(x = x, y = y), color = "gray30", alpha = 0.7) +
	geom_line(aes(x = x, y = yhat, frame = as.factor(poly)), color = "red1", size = 0.75) +
	geom_text(aes(x = 80, y = 85, frame = as.factor(poly),
	label = paste0("RMSE = ", RMSE))) +
	theme_bw() +
	facet_grid(dataset ~ .) +
	ggtitle("Training and Test RMSE by Polynomial Degree = ") +
	xlab("X Variable") +
	ylab("Y Value") +
	scale_x_continuous(breaks=seq(0, 100, 10)) +
	scale_y_continuous(breaks=seq(50, 95, 10), limits =c(50,95)) +
	theme(
	legend.position = "none",
	strip.background = element_rect(colour = "gray30", fill = "white",size = 0),
	strip.text.y = element_text(face = "bold"),
	panel.grid = element_line(colour = "gray70")
	)

	# print the composite graphic to check it out
	print(p)
	# this saves it out to the filename
	gg_animate(p, "trainv test.gif", interval = 0.5)

	### Plot for aggregate train and test errors from all z outer loops
	err_plot_dat <- melt(plot_dat, id.vars = c("Degree", "z"))
	err_mean <- group_by(err_plot_dat, variable, Degree) %>%
	summarise(err_mean = mean(value))
	ggplot(data = err_plot_dat, aes(x = Degree, y = value,
	group=interaction(z, variable)
	, color = variable)) +
	geom_line(size = 1, alpha = 0.05) +
	geom_line(data = err_mean, aes(x = Degree, y = err_mean,color = variable),
	size = 1.5) +
	theme_bw() +
	scale_color_manual(values = c("firebrick", "black")) +
	scale_x_continuous(breaks=seq(0, 20, 1)) +
	scale_y_continuous(breaks=seq(2, 5, 0.5), limits =c(2,5)) +
	xlab("Polynomial Degrees (Model Complexity)") +
	ylab("RMSE") +
	ggtitle("Test Versus Training Error over 100 Repeated Samples") +
	theme(
	panel.grid = element_blank()
	)