Skip to content

Instantly share code, notes, and snippets.

@dggoldst
Created October 15, 2014 18:54
Show Gist options
  • Save dggoldst/a0dba9dcaec070649ffa to your computer and use it in GitHub Desktop.
Save dggoldst/a0dba9dcaec070649ffa to your computer and use it in GitHub Desktop.
library(ggplot2)
library(dplyr)
AGEMIN=30
AGEMAX=110
NR=AGEMAX-AGEMIN+1
best_ABSDEV=1e6
#If you want the original data, you can get it from
#http://www.ssa.gov/OACT/STATS/table4c6.html
#And prepare it as follows.
#df=read.delim("lifex_2010.txt")
#df = df %>%
# filter(age>=AGEMIN & age <=AGEMAX) %>%
# mutate(lifex=(m_lifex+f_lifex)/2) %>% #Make a unisex life expectancy
# select(age,lifex)
#Or you can just work with the cleaned data, here:
df=structure(list(age = 30:110, lifex = c(49.8, 48.85, 47.9, 46.95,
46.005, 45.06, 44.11, 43.165, 42.225, 41.285, 40.345, 39.415,
38.48, 37.56, 36.64, 35.725, 34.82, 33.92, 33.025, 32.14, 31.26,
30.39, 29.525, 28.665, 27.81, 26.97, 26.125, 25.3, 24.47, 23.655,
22.84, 22.03, 21.23, 20.44, 19.655, 18.885, 18.12, 17.375, 16.635,
15.915, 15.2, 14.495, 13.81, 13.14, 12.475, 11.83, 11.205, 10.59,
10, 9.42, 8.855, 8.315, 7.79, 7.29, 6.81, 6.345, 5.91, 5.495,
5.1, 4.735, 4.395, 4.075, 3.785, 3.52, 3.28, 3.065, 2.875, 2.705,
2.55, 2.41, 2.275, 2.15, 2.025, 1.905, 1.795, 1.69, 1.585, 1.485,
1.385, 1.295, 1.215)), class = "data.frame", row.names = c(NA,
-81L), .Names = c("age", "lifex"))
### The best simple linear model
#For ages 30-110, the best fitting simple linear model is
#int=63.3, age=-.64
bestlinear_model=lm(data=df,lifex~age)
### The best second-order polynomial model
#For ages 30-110, the best fitting simple linear model is
#int=92.1,
#age=-1.56
#age^2=.007
bestpoly_model=lm(data=df,lifex~age+I(age^2))
### The best bi-linear model
#For ages 30-110, the best fitting bi-linear model has
#int1=73.84 slope1= -0.84, int2= 27.63, slope2=-0.25
#with a new line starting at age 80
errorlist=vector('list',nrow(df))
for(cut in (5):(nrow(df)-5)){
#There are slicker ways to do this, but this is easier to understand I think
#Define dummy variables for interecepts 1 and 2 and slopes 1 and 2
df=mutate(df,
i1=c(rep(1,cut),rep(0,AGEMAX-cut-AGEMIN+1)),
s1=i1*age,
i2=c(rep(0,cut),rep(1,AGEMAX-cut-AGEMIN+1)),
s2=i2*age
)
#Fit a candidate model to the dummies
candidate_bilinear_model=lm(data=df,lifex~0+i1+s1+i2+s2)
#Record the coefficients and predictions for the model for selection later
df$bilinear=predict(candidate_bilinear_model)
current_ABSDEV=with(df,sum(abs(bilinear-lifex))/nrow(df))
#Save the predictions of the best model
if(current_ABSDEV<best_ABSDEV){
best_ABSDEV=current_ABSDEV
best_bilinear_preds = df$bilinear
best_cut=cut}
#Keep a log of the various model coefficients and errors
errorlist[[cut]]=data.frame(cut=cut,
ABSDEV=current_ABSDEV,
coef_i1=round(candidate_bilinear_model$coef[1],2),
coef_s1=round(candidate_bilinear_model$coef[2],2),
coef_i2=round(candidate_bilinear_model$coef[3],2),
coef_s2=round(candidate_bilinear_model$coef[4],2))
}
errdat=do.call("rbind",errorlist)
#Plot to show we found the model with the lowest ABSDEV (in errdat at best_cut)
#Need to add AGEMIN to this to find the year at which to start the second line
p=ggplot(errdat,aes(x=cut,y=ABSDEV))
p=p+geom_point()
p
ggsave(plot=p,file="ABSDEV_minimizing.png",height=4,width=4)
### Heuristic bi-linear model
#Here's an approximation with rounder / easier to apply numbers
#If you're under 85, it's
#72 minus 80% of your age
#Otherwise it's
#22 minus 20% of your age
heuristic_bilinear_model=Vectorize(function(age){
if(age<85) {72-.8*age}
else {22-.2*age}
})
#The 50-15-5 model
#For ages 30, 70, 90, 110
#the respective life-expectancies are
#50, 15, 5, 0
#Go forth and interpolate
model_50_15_5=Vectorize(function(age){
if(age<70) {50-(age-30)*35/40}
else if(age<90) {15-(age-70)*10/20}
else {5-(age-90)*5/20}
})
### Doctor's heuristic
#100 minus your age divided by two
doctors=function(age) {(100-age)/2}
### Make predictions for each model
pred_dat = data.frame(
Age=df$age,
lifex=df$lifex,
Model=c(rep("Best Linear",NR),
rep("Best Polynomial",NR),
rep("Best Bi-linear",NR),
rep("Heuristic Bilinear",NR),
rep("50 15 5 Heuristic",NR),
rep("Doctors",NR)
),
Predicted_Life_Expectancy=c(predict(bestlinear_model),
predict(bestpoly_model),
best_bilinear_preds,
heuristic_bilinear_model(df$age),
model_50_15_5(df$age),
doctors(df$age)
))
###Error for each model
pred_dat = pred_dat %>%
mutate(ABSDEV=abs(lifex-Predicted_Life_Expectancy))
p=ggplot(subset(pred_dat,Model %in% c("Doctors","Best Linear")),
aes(x=Age,y=Predicted_Life_Expectancy,group=Model,color=Model))
p=p+geom_point(color="black",aes(y=lifex))
p=p+geom_line(size=1.25)
p=p+theme(legend.position="bottom")
p=p+scale_x_continuous(breaks=seq(0,110,10))
p=p+scale_y_continuous(breaks=seq(-10,110,10))
p=p+scale_color_brewer(palette="Set1")
p
ggsave(plot=p,file="linear.png",height=4,width=6)
p=ggplot(subset(pred_dat,Model %in% c("Best Polynomial","Best Bi-linear")),
aes(x=Age,y=Predicted_Life_Expectancy,group=Model,color=Model))
p=p+geom_point(color="black",aes(y=lifex))
p=p+geom_line(size=1.25)
p=p+theme(legend.position="bottom")
p=p+scale_x_continuous(breaks=seq(0,110,10))
p=p+scale_y_continuous(breaks=seq(-10,110,10))
p=p+scale_color_brewer(palette="Dark2")
p
ggsave(plot=p,file="fitted_nonlinear.png",height=4,width=6)
p=ggplot(subset(pred_dat,Model %in% c("Heuristic Bilinear","50 15 5 Heuristic")),
aes(x=Age,y=Predicted_Life_Expectancy,group=Model,color=Model))
p=p+geom_point(color="black",aes(y=lifex))
p=p+geom_line(size=1.25)
p=p+theme(legend.position="bottom")
p=p+scale_x_continuous(breaks=seq(0,110,10))
p=p+scale_y_continuous(breaks=seq(0,110,10))
p=p+scale_color_brewer(palette="Accent")
p
ggsave(plot=p,file="heuristic_nonlinear.png",height=4,width=6)
mad_df = pred_dat %>%
group_by(Model) %>%
summarize(MAD=mean(ABSDEV),
seMAD=sqrt(var(ABSDEV)/length(ABSDEV))) %>%
arrange(MAD)
mad_df = mad_df %>%
mutate(Model=factor(Model,levels=mad_df$Model))
p=ggplot(mad_df,aes(x=Model,y=MAD))
p=p+geom_point()
p=p+geom_errorbar(width=.2,aes(ymin=MAD-seMAD,ymax=MAD+seMAD))
p=p+theme( axis.text.x = element_text(angle=90, vjust=0.5))
p=p+ggtitle("Mean Absolute Deviation of Predictions")
p
ggsave(plot=p,file="mad_predictions.png",height=4,width=4)
#######Try it w/ death age
df = df %>% mutate(deathx=age+lifex)
p=ggplot(df,aes(x=age,y=deathx))
p=p+geom_point()
p=p+scale_x_continuous(breaks=seq(0,110,10))
p=p+scale_y_continuous(breaks=seq(0,110,10))
p
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment