Last active
October 19, 2016 19:55
-
-
Save eric-pedersen/34024b8be884562da266ac60e1042dc5 to your computer and use it in GitHub Desktop.
I wanted to see how the choice of midpoint in a circular cubic smooth in mgcv affected model fit. When fitting monthly models, we know that the break between month 1 &12 should occur between them (not right at the values). How does the choice of midpoint affect bias/variance of the fits?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(mgcv) | |
library(purrr) | |
nreps = 100 | |
#Simulate data from a sinusoidal function. Point of equality should be at | |
#0/12 (or 0.5/12.5, or 1/13, etc.) by design. | |
sims = crossing(x= 1:12, rep = 1:nreps)%>% | |
mutate(y= 4*sin(2*pi*x/12)+rnorm(nreps,0,1)) | |
# Calculates 3 measures: the standard error of prediction at 1 and 12 months, | |
# and the st. dev. of misfit from the true function, averaged over all 12 months | |
calc_se_values = function(x){ | |
test_dat = data_frame(x=1:12) | |
prediction = predict.gam(x, test_dat, type="response",se.fit=T) | |
se_val = as.numeric(prediction$se.fit) | |
bias = as.numeric(sd(prediction$fit - 4*sin(2*pi*test_dat$x/12))) | |
return(data_frame(`1`= se_val[1], `12` = se_val[12], | |
bias= mean(bias))) | |
} | |
# The naive model, where no knots are specified. Split is assumed to occur at | |
# 1/12 | |
naive_split = gam(y~s(x,bs="cc"),data = sims) | |
naive_fit= calc_se_values(naive_split) | |
naive_se = naive_fit$`1` | |
naive_bias = naive_fit$bias[1] | |
#Putting the splitpoint at a range of values, from 0/12 to 1/13 then calculating | |
#summary statistics | |
split_data = data_frame(split = seq(0,1,length= 11))%>% | |
group_by(split)%>% | |
do(mod = gam(y~s(x, bs="cc"), data=sims, | |
knots = list(x= c(1-.$split[1], | |
12+(1-.$split[1])))))%>% | |
do(se_val = calc_se_values(.$mod))%>% | |
ungroup()%>% | |
mutate(split = seq(0,1,length= 11))%>% | |
unnest(se_val)%>% | |
mutate(midpoint = paste(round(split,2), round(split,2)+12, sep="/")) | |
#Combining all the se data into a data frame for plotting in ggplot2 | |
se_data = split_data %>% | |
select(midpoint,split, `1`,`12`)%>% | |
gather(month, se, `1`,`12`) | |
# plot of se of first and last months vs. the midpoint. horizontal line indicates | |
# the se for the naive model (which is equal for month 1&12) | |
se_plot = qplot(split, se, color=factor(month), data=se_data)+ | |
scale_colour_brewer("month",palette="Set1")+ | |
scale_x_continuous("split point", breaks=split_data$split, labels=split_data$midpoint)+ | |
scale_y_continuous("pointwise standard error")+ | |
theme_bw()+ | |
geom_hline(aes(yintercept=naive_se)) | |
print(se_plot) | |
# plot of total bias across the function. Note I'm not including the bias for the | |
# naive model, as it's always a lot higher than the accurate model, and swamps | |
# the variation | |
bias_plot = qplot(split, bias, data=split_data)+ | |
scale_x_continuous("split point", breaks=split_data$split, labels=split_data$midpoint)+ | |
scale_y_continuous("bias from true value")+ | |
theme_bw() | |
#geom_hline(aes(yintercept=naive_bias)) | |
print(bias_plot) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment