Skip to content

Instantly share code, notes, and snippets.

@bennytowns
Created August 22, 2016 02:25
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bennytowns/8741bbf4898f0516fb70c0017b13be60 to your computer and use it in GitHub Desktop.
Save bennytowns/8741bbf4898f0516fb70c0017b13be60 to your computer and use it in GitHub Desktop.
library(stats)
library(VIM)
library(mice)
library(car)
library(Hmisc)
library(dplyr)
setwd('~/R')
#import and prepare the master data.
full_golf=read.csv('full_golf_new.csv',strip.white=T,stringsAsFactors=FALSE)
row.names(full_golf)=paste0(full_golf$name,full_golf$year)
full_golf=full_golf[,-which(names(full_golf) %in% c('X','name'))]
golf_fedex = full_golf[is.na(full_golf$rk_fedex)==F,]
golf_fedex$rk_fedex=golf_fedex$rk_fedex/golf_fedex$rk_fedex_variable2
summary(golf_fedex)
aggr(golf_fedex) #Lots of missing data, but is it relevant data? Get a good subset of data first
########################################################################################################
################################Shots Gained v fedex Earned#############################################
########################################################################################################
#Prepare the dataset
sg_data=golf_fedex[golf_fedex$year!=2016,c('rk_fedex','sg_ott','sg_aptg','sg_artg','sg_putt')]
sg_test=golf_fedex[golf_fedex$year==2016,c('sg_ott','sg_aptg','sg_artg','sg_putt')]
sg_test_actual=golf_fedex[golf_fedex$year==2016,c('rk_fedex','sg_ott','sg_aptg','sg_artg','sg_putt')]
sg_test_actual=sg_test_actual[which(complete.cases(sg_test_actual)),]
aggr(sg_data)
md.pattern(sg_data) #Missing Predictor Variables... not really imputable, so we won't test these, fair to drop
#because this data only recorded for top 200 players, the others will be OUTSIDE of the other data
sg_data_complete=sg_data[which(complete.cases(sg_data)),]
sg_test_complete=sg_test[which(complete.cases(sg_test)),]
aggr(sg_data_complete)
summary(sg_data_complete) #Data is prepped for testing
#Fit a multiple linear regression
sg.saturated=lm(rk_fedex~.,sg_data_complete)
summary(sg.saturated)
vif(sg.saturated)
avPlots(sg.saturated)
plot(sg.saturated)
#Better Box-Cox transform this
sg_sat.bc=boxCox(sg.saturated)
lambda = sg_sat.bc$x[which(sg_sat.bc$y == max(sg_sat.bc$y))]
sg_data_complete$rk_fedex.bc = (sg_data_complete$rk_fedex^lambda - 1)/lambda
sg_sat_mod.bc=lm(rk_fedex.bc~sg_ott +sg_aptg +sg_artg +sg_putt,sg_data_complete)
summary(sg_sat_mod.bc)
plot(sg_sat_mod.bc)
vif(sg_sat_mod.bc)
avPlots(sg_sat_mod.bc)
#Assumptions look good
#Looks like multiple linear regression model may hold.
#But, let's see if we need all of these variables...
#prepare less fit models
model.empty=lm(rk_fedex.bc~1,sg_data_complete)
model.full=lm(rk_fedex.bc~.-rk_fedex,sg_data_complete)
scope=list(lower = formula(model.empty), upper = formula(model.full))
forwardAIC = step(model.empty, scope, direction = "forward", k = 2)
backwardAIC = step(model.full, scope, direction = "backward", k = 2)
bothAIC.empty = step(model.empty, scope, direction = "both", k = 2)
bothAIC.full = step(model.full, scope, direction = "both", k = 2)
#Stepwise regression using BIC as the criteria (the penalty k = log(n)).
forwardBIC = step(model.empty, scope, direction = "forward", k = log(196))
backwardBIC = step(model.full, scope, direction = "backward", k = log(196))
bothBIC.empty = step(model.empty, scope, direction = "both", k = log(196))
bothBIC.full = step(model.full, scope, direction = "both", k = log(196))
#Looks like combination of the four stats will work fine, let's predict 2016 output
fedex_predict_2016=predict(sg_sat_mod.bc,sg_test_complete,interval='confidence')
sg_test_actual$rk_fedex.bc = (sg_test_actual$rk_fedex^lambda - 1)/lambda
sg_test_actual=cbind(sg_test_actual,fedex_predict_2016)
sg_test_actual$fit_error=sg_test_actual$rk_fedex.bc-sg_test_actual$fit
sg_test_actual$error_sq=sg_test_actual$fit_error**2
sg_total_error=sum(sg_test_actual$error_sq)
#LET'S MAKE OUR OWN MODEL
########################################################################################################
##############################Other Variables v fedex Earned############################################
########################################################################################################
#prepare a data set
raw_data=golf_fedex[golf_fedex$year!=2016,c('rk_fedex','drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')]
raw_test=golf_fedex[golf_fedex$year==2016,c('drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')]
raw_test_actual=golf_fedex[golf_fedex$year==2016,c('rk_fedex','drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')]
aggr(raw_data) #Again, the lower-ranked golfers, so hard to impute, since they all fall outside of the data ranges.
raw_data=raw_data[which(complete.cases(raw_data)),]
raw_test_complete=raw_test[which(complete.cases(raw_test)),]
aggr(raw_data)
#fit multiple linear model on all data
td_mod = lm(rk_fedex~.,raw_data)
summary(td_mod)
td_mod_summary = summary(td_mod)
avPlots(td_mod)
plot(td_mod) #Definitely some relationship, but it's not really "linear", maybe we can make it so with box-cox transform
#First, let's build a model with reduced variable set and see how that looks.
##########################MAKE MY OWN REDUCED MODEL################################
td_mod_red = lm(rk_fedex~drd+dra+gir+ssv+pth+pthatg+pmd+ppr,raw_data)
summary(td_mod_red)
plot(td_mod_red)
vif(td_mod_red)
#Box Cox transform
td_mod.bc=boxCox(td_mod)
lambda = td_mod.bc$x[which(td_mod.bc$y == max(td_mod.bc$y))]
raw_data$rk_fedex.bc = (raw_data$rk_fedex^lambda - 1)/lambda
td_mod.bc=lm(rk_fedex.bc~.-rk_fedex,raw_data)
summary(td_mod.bc)
plot(td_mod.bc) #Looks like a reasonable model, let's try a reduced model based on significant variables
#Test Box Cox transformed dependent variable on the reduced variable set
td_mod_red.bc=lm(rk_fedex.bc~drd+dra+gir+ssv+pth+pthatg+pmd+ppr,raw_data)
summary(td_mod_red.bc)
plot(td_mod_red.bc)
vif(td_mod_red.bc)
avPlots(td_mod_red.bc)
#Looks like a decent model, but let's see if there's a "best model"
#Check full vs reduced
AIC(td_mod_red.bc,td_mod.bc)
BIC(td_mod_red.bc,td_mod.bc)
#AIC and BIC don't show a clear advantage
#Alternatively, let's do a stepwise regression and see what set of variables is identified
model.empty=lm(rk_fedex.bc~1,raw_data)
model.full=lm(rk_fedex.bc~.-rk_fedex,raw_data)
scope=list(lower = formula(model.empty), upper = formula(model.full))
forwardAIC = step(model.empty, scope, direction = "forward", k = 2)
backwardAIC = step(model.full, scope, direction = "backward", k = 2)
bothAIC.empty = step(model.empty, scope, direction = "both", k = 2)
bothAIC.full = step(model.full, scope, direction = "both", k = 2)
#Stepwise regression using BIC as the criteria (the penalty k = log(n)).
forwardBIC = step(model.empty, scope, direction = "forward", k = log(196))
backwardBIC = step(model.full, scope, direction = "backward", k = log(196))
bothBIC.empty = step(model.empty, scope, direction = "both", k = log(196))
bothBIC.full = step(model.full, scope, direction = "both", k = log(196))
#BIC identifies a subset, similar to the SG model statistics but includes some overlapping areas, dra, pmd
#after reviewing VIF, we can safely remove dra and pmd on concerns about collinearity. Resulting variable set follows:
td_mod_red2.bc=lm(rk_fedex.bc~drd+gir+pthatg+ppr,raw_data)
summary(td_mod_red2.bc)
#Looks like resonably good model, but check assumptions
plot(td_mod_red2.bc)
vif(td_mod_red2.bc)
avPlots(td_mod_red2.bc)
###Which Model is best?
AIC(td_mod_red2.bc,sg_sat_mod.bc)
BIC(td_mod_red2.bc,sg_sat_mod.bc)
#Test to see how well it predicts
raw_mod.predicrat=data.frame(predict(td_mod_red2.bc,raw_test_complete,interval='confidence'))
summary(raw_mod.predict)
sg_test_actual$raw_mod_predict=raw_mod.predict$fit
sg_test_actual$raw_fit_error=sg_test_actual$rk_fedex.bc-sg_test_actual$raw_mod_predict
sg_test_actual$raw_error_sq = sg_test_actual$raw_fit_error**2
#Compare the two models
mean(sg_test_actual$raw_error_sq)
sd(sg_test_actual$raw_error_sq)
mean(sg_test_actual$error_sq)
sd(sg_test_actual$error_sq)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment