eric-pedersen/gam_cc_midpoint_test.R

## gam_cc_midpoint_test.R
library(dplyr)
library(tidyr)
library(ggplot2)
library(mgcv)
library(purrr)
nreps = 100
#Simulate data from a sinusoidal function. Point of equality should be at
#0/12 (or 0.5/12.5, or 1/13, etc.) by design.
sims = crossing(x= 1:12, rep = 1:nreps)%>%
  mutate(y= 4*sin(2*pi*x/12)+rnorm(nreps,0,1))

# Calculates 3 measures: the standard error of prediction at 1 and 12 months,
# and the st. dev. of misfit from the true function, averaged over all 12 months
calc_se_values = function(x){
  test_dat = data_frame(x=1:12)
  prediction = predict.gam(x, test_dat, type="response",se.fit=T)
  se_val = as.numeric(prediction$se.fit)
  bias = as.numeric(sd(prediction$fit - 4*sin(2*pi*test_dat$x/12)))
  return(data_frame(`1`= se_val[1], `12` = se_val[12],
                    bias= mean(bias)))
}

# The naive model, where no knots are specified. Split is assumed to occur at
# 1/12
naive_split = gam(y~s(x,bs="cc"),data = sims)
naive_fit= calc_se_values(naive_split)
naive_se = naive_fit$`1`
naive_bias = naive_fit$bias[1]

#Putting the splitpoint at a range of values, from 0/12 to 1/13 then calculating
#summary statistics

split_data = data_frame(split = seq(0,1,length= 11))%>%
  group_by(split)%>%
  do(mod = gam(y~s(x, bs="cc"), data=sims,
                             knots = list(x= c(1-.$split[1],
                                               12+(1-.$split[1])))))%>%
  do(se_val = calc_se_values(.$mod))%>%
  ungroup()%>%
  mutate(split = seq(0,1,length= 11))%>%
  unnest(se_val)%>%
  mutate(midpoint = paste(round(split,2), round(split,2)+12, sep="/"))

#Combining all the se data into a data frame for plotting in ggplot2
se_data = split_data  %>%
  select(midpoint,split, `1`,`12`)%>%
  gather(month, se, `1`,`12`)

# plot of se of first and last months vs. the midpoint. horizontal line indicates
# the se for the naive model (which is equal for month 1&12)
se_plot = qplot(split, se, color=factor(month), data=se_data)+
  scale_colour_brewer("month",palette="Set1")+
  scale_x_continuous("split point", breaks=split_data$split, labels=split_data$midpoint)+
  scale_y_continuous("pointwise standard error")+
  theme_bw()+
  geom_hline(aes(yintercept=naive_se))

print(se_plot)

# plot of total bias across the function. Note I'm not including the bias for the
# naive model, as it's always a lot higher than the accurate model, and swamps
# the variation
bias_plot = qplot(split, bias,  data=split_data)+
  scale_x_continuous("split point", breaks=split_data$split, labels=split_data$midpoint)+
  scale_y_continuous("bias from true value")+
  theme_bw()
  #geom_hline(aes(yintercept=naive_bias))

print(bias_plot)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(mgcv)
	library(purrr)
	nreps = 100
	#Simulate data from a sinusoidal function. Point of equality should be at
	#0/12 (or 0.5/12.5, or 1/13, etc.) by design.
	sims = crossing(x= 1:12, rep = 1:nreps)%>%
	mutate(y= 4sin(2pi*x/12)+rnorm(nreps,0,1))

	# Calculates 3 measures: the standard error of prediction at 1 and 12 months,
	# and the st. dev. of misfit from the true function, averaged over all 12 months
	calc_se_values = function(x){
	test_dat = data_frame(x=1:12)
	prediction = predict.gam(x, test_dat, type="response",se.fit=T)
	se_val = as.numeric(prediction$se.fit)
	bias = as.numeric(sd(prediction$fit - 4sin(2pi*test_dat$x/12)))
	return(data_frame(`1`= se_val[1], `12` = se_val[12],
	bias= mean(bias)))
	}

	# The naive model, where no knots are specified. Split is assumed to occur at
	# 1/12
	naive_split = gam(y~s(x,bs="cc"),data = sims)
	naive_fit= calc_se_values(naive_split)
	naive_se = naive_fit$`1`
	naive_bias = naive_fit$bias[1]

	#Putting the splitpoint at a range of values, from 0/12 to 1/13 then calculating
	#summary statistics

	split_data = data_frame(split = seq(0,1,length= 11))%>%
	group_by(split)%>%
	do(mod = gam(y~s(x, bs="cc"), data=sims,
	knots = list(x= c(1-.$split[1],
	12+(1-.$split[1])))))%>%
	do(se_val = calc_se_values(.$mod))%>%
	ungroup()%>%
	mutate(split = seq(0,1,length= 11))%>%
	unnest(se_val)%>%
	mutate(midpoint = paste(round(split,2), round(split,2)+12, sep="/"))

	#Combining all the se data into a data frame for plotting in ggplot2
	se_data = split_data %>%
	select(midpoint,split, `1`,`12`)%>%
	gather(month, se, `1`,`12`)

	# plot of se of first and last months vs. the midpoint. horizontal line indicates
	# the se for the naive model (which is equal for month 1&12)
	se_plot = qplot(split, se, color=factor(month), data=se_data)+
	scale_colour_brewer("month",palette="Set1")+
	scale_x_continuous("split point", breaks=split_data$split, labels=split_data$midpoint)+
	scale_y_continuous("pointwise standard error")+
	theme_bw()+
	geom_hline(aes(yintercept=naive_se))

	print(se_plot)

	# plot of total bias across the function. Note I'm not including the bias for the
	# naive model, as it's always a lot higher than the accurate model, and swamps
	# the variation
	bias_plot = qplot(split, bias, data=split_data)+
	scale_x_continuous("split point", breaks=split_data$split, labels=split_data$midpoint)+
	scale_y_continuous("bias from true value")+
	theme_bw()
	#geom_hline(aes(yintercept=naive_bias))

	print(bias_plot)