Skip to content

Instantly share code, notes, and snippets.

@eric-pedersen
Last active October 19, 2016 19:55
Show Gist options
  • Save eric-pedersen/34024b8be884562da266ac60e1042dc5 to your computer and use it in GitHub Desktop.
Save eric-pedersen/34024b8be884562da266ac60e1042dc5 to your computer and use it in GitHub Desktop.
I wanted to see how the choice of midpoint in a circular cubic smooth in mgcv affected model fit. When fitting monthly models, we know that the break between month 1 &12 should occur between them (not right at the values). How does the choice of midpoint affect bias/variance of the fits?
library(dplyr)
library(tidyr)
library(ggplot2)
library(mgcv)
library(purrr)
nreps = 100
#Simulate data from a sinusoidal function. Point of equality should be at
#0/12 (or 0.5/12.5, or 1/13, etc.) by design.
sims = crossing(x= 1:12, rep = 1:nreps)%>%
mutate(y= 4*sin(2*pi*x/12)+rnorm(nreps,0,1))
# Calculates 3 measures: the standard error of prediction at 1 and 12 months,
# and the st. dev. of misfit from the true function, averaged over all 12 months
calc_se_values = function(x){
test_dat = data_frame(x=1:12)
prediction = predict.gam(x, test_dat, type="response",se.fit=T)
se_val = as.numeric(prediction$se.fit)
bias = as.numeric(sd(prediction$fit - 4*sin(2*pi*test_dat$x/12)))
return(data_frame(`1`= se_val[1], `12` = se_val[12],
bias= mean(bias)))
}
# The naive model, where no knots are specified. Split is assumed to occur at
# 1/12
naive_split = gam(y~s(x,bs="cc"),data = sims)
naive_fit= calc_se_values(naive_split)
naive_se = naive_fit$`1`
naive_bias = naive_fit$bias[1]
#Putting the splitpoint at a range of values, from 0/12 to 1/13 then calculating
#summary statistics
split_data = data_frame(split = seq(0,1,length= 11))%>%
group_by(split)%>%
do(mod = gam(y~s(x, bs="cc"), data=sims,
knots = list(x= c(1-.$split[1],
12+(1-.$split[1])))))%>%
do(se_val = calc_se_values(.$mod))%>%
ungroup()%>%
mutate(split = seq(0,1,length= 11))%>%
unnest(se_val)%>%
mutate(midpoint = paste(round(split,2), round(split,2)+12, sep="/"))
#Combining all the se data into a data frame for plotting in ggplot2
se_data = split_data %>%
select(midpoint,split, `1`,`12`)%>%
gather(month, se, `1`,`12`)
# plot of se of first and last months vs. the midpoint. horizontal line indicates
# the se for the naive model (which is equal for month 1&12)
se_plot = qplot(split, se, color=factor(month), data=se_data)+
scale_colour_brewer("month",palette="Set1")+
scale_x_continuous("split point", breaks=split_data$split, labels=split_data$midpoint)+
scale_y_continuous("pointwise standard error")+
theme_bw()+
geom_hline(aes(yintercept=naive_se))
print(se_plot)
# plot of total bias across the function. Note I'm not including the bias for the
# naive model, as it's always a lot higher than the accurate model, and swamps
# the variation
bias_plot = qplot(split, bias, data=split_data)+
scale_x_continuous("split point", breaks=split_data$split, labels=split_data$midpoint)+
scale_y_continuous("bias from true value")+
theme_bw()
#geom_hline(aes(yintercept=naive_bias))
print(bias_plot)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment