bennytowns/golf_fedex_more.R

## golf_fedex_more.R
library(stats)
library(VIM)
library(mice)
library(car)
library(Hmisc)
library(dplyr)
setwd('~/R')

#import and prepare the master data.
full_golf=read.csv('full_golf_new.csv',strip.white=T,stringsAsFactors=FALSE)
row.names(full_golf)=paste0(full_golf$name,full_golf$year)
full_golf=full_golf[,-which(names(full_golf) %in% c('X','name'))]
golf_fedex = full_golf[is.na(full_golf$rk_fedex)==F,]
golf_fedex$rk_fedex=golf_fedex$rk_fedex/golf_fedex$rk_fedex_variable2
summary(golf_fedex)
aggr(golf_fedex) #Lots of missing data, but is it relevant data?  Get a good subset of data first

########################################################################################################
################################Shots Gained v fedex Earned#############################################
########################################################################################################

#Prepare the dataset
sg_data=golf_fedex[golf_fedex$year!=2016,c('rk_fedex','sg_ott','sg_aptg','sg_artg','sg_putt')]
sg_test=golf_fedex[golf_fedex$year==2016,c('sg_ott','sg_aptg','sg_artg','sg_putt')]
sg_test_actual=golf_fedex[golf_fedex$year==2016,c('rk_fedex','sg_ott','sg_aptg','sg_artg','sg_putt')]
sg_test_actual=sg_test_actual[which(complete.cases(sg_test_actual)),]
aggr(sg_data)
md.pattern(sg_data) #Missing Predictor Variables... not really imputable, so we won't test these, fair to drop
                    #because this data only recorded for top 200 players, the others will be OUTSIDE of the other data
sg_data_complete=sg_data[which(complete.cases(sg_data)),]
sg_test_complete=sg_test[which(complete.cases(sg_test)),]
aggr(sg_data_complete)
summary(sg_data_complete) #Data is prepped for testing

#Fit a multiple linear regression
sg.saturated=lm(rk_fedex~.,sg_data_complete)
summary(sg.saturated)
vif(sg.saturated)
avPlots(sg.saturated)
plot(sg.saturated)

#Better Box-Cox transform this
sg_sat.bc=boxCox(sg.saturated)
lambda = sg_sat.bc$x[which(sg_sat.bc$y == max(sg_sat.bc$y))]
sg_data_complete$rk_fedex.bc = (sg_data_complete$rk_fedex^lambda - 1)/lambda
sg_sat_mod.bc=lm(rk_fedex.bc~sg_ott +sg_aptg +sg_artg +sg_putt,sg_data_complete)
summary(sg_sat_mod.bc)
plot(sg_sat_mod.bc)
vif(sg_sat_mod.bc)
avPlots(sg_sat_mod.bc)
#Assumptions look good
#Looks like multiple linear regression model may hold.
#But, let's see if we need all of these variables...

#prepare less fit models
model.empty=lm(rk_fedex.bc~1,sg_data_complete)
model.full=lm(rk_fedex.bc~.-rk_fedex,sg_data_complete)
scope=list(lower = formula(model.empty), upper = formula(model.full))

forwardAIC = step(model.empty, scope, direction = "forward", k = 2)
backwardAIC = step(model.full, scope, direction = "backward", k = 2)
bothAIC.empty = step(model.empty, scope, direction = "both", k = 2)
bothAIC.full = step(model.full, scope, direction = "both", k = 2)

#Stepwise regression using BIC as the criteria (the penalty k = log(n)).
forwardBIC = step(model.empty, scope, direction = "forward", k = log(196))
backwardBIC = step(model.full, scope, direction = "backward", k = log(196))
bothBIC.empty = step(model.empty, scope, direction = "both", k = log(196))
bothBIC.full = step(model.full, scope, direction = "both", k = log(196))

#Looks like combination of the four stats will work fine, let's predict 2016 output
fedex_predict_2016=predict(sg_sat_mod.bc,sg_test_complete,interval='confidence')
sg_test_actual$rk_fedex.bc = (sg_test_actual$rk_fedex^lambda - 1)/lambda
sg_test_actual=cbind(sg_test_actual,fedex_predict_2016)
sg_test_actual$fit_error=sg_test_actual$rk_fedex.bc-sg_test_actual$fit
sg_test_actual$error_sq=sg_test_actual$fit_error**2
sg_total_error=sum(sg_test_actual$error_sq)


#LET'S MAKE OUR OWN MODEL

########################################################################################################
##############################Other Variables v fedex Earned############################################
########################################################################################################

#prepare a data set
raw_data=golf_fedex[golf_fedex$year!=2016,c('rk_fedex','drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')]
raw_test=golf_fedex[golf_fedex$year==2016,c('drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')]
raw_test_actual=golf_fedex[golf_fedex$year==2016,c('rk_fedex','drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')]
aggr(raw_data) #Again, the lower-ranked golfers, so hard to impute, since they all fall outside of the data ranges.
raw_data=raw_data[which(complete.cases(raw_data)),]
raw_test_complete=raw_test[which(complete.cases(raw_test)),]
aggr(raw_data)

#fit multiple linear model on all data
td_mod = lm(rk_fedex~.,raw_data)
summary(td_mod)
td_mod_summary = summary(td_mod)
avPlots(td_mod)
plot(td_mod)  #Definitely some relationship, but it's not really "linear", maybe we can make it so with box-cox transform
#First, let's build a model with reduced variable set and see how that looks.

##########################MAKE MY OWN REDUCED MODEL################################
td_mod_red = lm(rk_fedex~drd+dra+gir+ssv+pth+pthatg+pmd+ppr,raw_data)
summary(td_mod_red)
plot(td_mod_red)
vif(td_mod_red)

#Box Cox transform
td_mod.bc=boxCox(td_mod)
lambda = td_mod.bc$x[which(td_mod.bc$y == max(td_mod.bc$y))]
raw_data$rk_fedex.bc = (raw_data$rk_fedex^lambda - 1)/lambda
td_mod.bc=lm(rk_fedex.bc~.-rk_fedex,raw_data)
summary(td_mod.bc)
plot(td_mod.bc) #Looks like a reasonable model, let's try a reduced model based on significant variables

#Test Box Cox transformed dependent variable on the reduced variable set
td_mod_red.bc=lm(rk_fedex.bc~drd+dra+gir+ssv+pth+pthatg+pmd+ppr,raw_data)
summary(td_mod_red.bc)
plot(td_mod_red.bc)
vif(td_mod_red.bc)
avPlots(td_mod_red.bc)
#Looks like a decent model, but let's see if there's a "best model"

#Check full vs reduced
AIC(td_mod_red.bc,td_mod.bc)
BIC(td_mod_red.bc,td_mod.bc)
#AIC and BIC don't show a clear advantage

#Alternatively, let's do a stepwise regression and see what set of variables is identified
model.empty=lm(rk_fedex.bc~1,raw_data)
model.full=lm(rk_fedex.bc~.-rk_fedex,raw_data)
scope=list(lower = formula(model.empty), upper = formula(model.full))

forwardAIC = step(model.empty, scope, direction = "forward", k = 2)
backwardAIC = step(model.full, scope, direction = "backward", k = 2)
bothAIC.empty = step(model.empty, scope, direction = "both", k = 2)
bothAIC.full = step(model.full, scope, direction = "both", k = 2)

#Stepwise regression using BIC as the criteria (the penalty k = log(n)).
forwardBIC = step(model.empty, scope, direction = "forward", k = log(196))
backwardBIC = step(model.full, scope, direction = "backward", k = log(196))
bothBIC.empty = step(model.empty, scope, direction = "both", k = log(196))
bothBIC.full = step(model.full, scope, direction = "both", k = log(196))

#BIC identifies a subset, similar to the SG model statistics but includes some overlapping areas, dra, pmd
#after reviewing VIF, we can safely remove dra and pmd on concerns about collinearity.  Resulting variable set follows:
td_mod_red2.bc=lm(rk_fedex.bc~drd+gir+pthatg+ppr,raw_data)
summary(td_mod_red2.bc)

#Looks like resonably good model, but check assumptions
plot(td_mod_red2.bc)
vif(td_mod_red2.bc)
avPlots(td_mod_red2.bc)

###Which Model is best?
AIC(td_mod_red2.bc,sg_sat_mod.bc)
BIC(td_mod_red2.bc,sg_sat_mod.bc)

#Test to see how well it predicts
raw_mod.predicrat=data.frame(predict(td_mod_red2.bc,raw_test_complete,interval='confidence'))
summary(raw_mod.predict)
sg_test_actual$raw_mod_predict=raw_mod.predict$fit
sg_test_actual$raw_fit_error=sg_test_actual$rk_fedex.bc-sg_test_actual$raw_mod_predict
sg_test_actual$raw_error_sq = sg_test_actual$raw_fit_error**2

#Compare the two models
mean(sg_test_actual$raw_error_sq)
sd(sg_test_actual$raw_error_sq)
mean(sg_test_actual$error_sq)
sd(sg_test_actual$error_sq)
	library(stats)
	library(VIM)
	library(mice)
	library(car)
	library(Hmisc)
	library(dplyr)
	setwd('~/R')

	#import and prepare the master data.
	full_golf=read.csv('full_golf_new.csv',strip.white=T,stringsAsFactors=FALSE)
	row.names(full_golf)=paste0(full_golf$name,full_golf$year)
	full_golf=full_golf[,-which(names(full_golf) %in% c('X','name'))]
	golf_fedex = full_golf[is.na(full_golf$rk_fedex)==F,]
	golf_fedex$rk_fedex=golf_fedex$rk_fedex/golf_fedex$rk_fedex_variable2
	summary(golf_fedex)
	aggr(golf_fedex) #Lots of missing data, but is it relevant data? Get a good subset of data first

	########################################################################################################
	################################Shots Gained v fedex Earned#############################################
	########################################################################################################

	#Prepare the dataset
	sg_data=golf_fedex[golf_fedex$year!=2016,c('rk_fedex','sg_ott','sg_aptg','sg_artg','sg_putt')]
	sg_test=golf_fedex[golf_fedex$year==2016,c('sg_ott','sg_aptg','sg_artg','sg_putt')]
	sg_test_actual=golf_fedex[golf_fedex$year==2016,c('rk_fedex','sg_ott','sg_aptg','sg_artg','sg_putt')]
	sg_test_actual=sg_test_actual[which(complete.cases(sg_test_actual)),]
	aggr(sg_data)
	md.pattern(sg_data) #Missing Predictor Variables... not really imputable, so we won't test these, fair to drop
	#because this data only recorded for top 200 players, the others will be OUTSIDE of the other data
	sg_data_complete=sg_data[which(complete.cases(sg_data)),]
	sg_test_complete=sg_test[which(complete.cases(sg_test)),]
	aggr(sg_data_complete)
	summary(sg_data_complete) #Data is prepped for testing

	#Fit a multiple linear regression
	sg.saturated=lm(rk_fedex~.,sg_data_complete)
	summary(sg.saturated)
	vif(sg.saturated)
	avPlots(sg.saturated)
	plot(sg.saturated)

	#Better Box-Cox transform this
	sg_sat.bc=boxCox(sg.saturated)
	lambda = sg_sat.bc$x[which(sg_sat.bc$y == max(sg_sat.bc$y))]
	sg_data_complete$rk_fedex.bc = (sg_data_complete$rk_fedex^lambda - 1)/lambda
	sg_sat_mod.bc=lm(rk_fedex.bc~sg_ott +sg_aptg +sg_artg +sg_putt,sg_data_complete)
	summary(sg_sat_mod.bc)
	plot(sg_sat_mod.bc)
	vif(sg_sat_mod.bc)
	avPlots(sg_sat_mod.bc)
	#Assumptions look good
	#Looks like multiple linear regression model may hold.
	#But, let's see if we need all of these variables...

	#prepare less fit models
	model.empty=lm(rk_fedex.bc~1,sg_data_complete)
	model.full=lm(rk_fedex.bc~.-rk_fedex,sg_data_complete)
	scope=list(lower = formula(model.empty), upper = formula(model.full))

	forwardAIC = step(model.empty, scope, direction = "forward", k = 2)
	backwardAIC = step(model.full, scope, direction = "backward", k = 2)
	bothAIC.empty = step(model.empty, scope, direction = "both", k = 2)
	bothAIC.full = step(model.full, scope, direction = "both", k = 2)

	#Stepwise regression using BIC as the criteria (the penalty k = log(n)).
	forwardBIC = step(model.empty, scope, direction = "forward", k = log(196))
	backwardBIC = step(model.full, scope, direction = "backward", k = log(196))
	bothBIC.empty = step(model.empty, scope, direction = "both", k = log(196))
	bothBIC.full = step(model.full, scope, direction = "both", k = log(196))

	#Looks like combination of the four stats will work fine, let's predict 2016 output
	fedex_predict_2016=predict(sg_sat_mod.bc,sg_test_complete,interval='confidence')
	sg_test_actual$rk_fedex.bc = (sg_test_actual$rk_fedex^lambda - 1)/lambda
	sg_test_actual=cbind(sg_test_actual,fedex_predict_2016)
	sg_test_actual$fit_error=sg_test_actual$rk_fedex.bc-sg_test_actual$fit
	sg_test_actual$error_sq=sg_test_actual$fit_error**2
	sg_total_error=sum(sg_test_actual$error_sq)


	#LET'S MAKE OUR OWN MODEL

	########################################################################################################
	##############################Other Variables v fedex Earned############################################
	########################################################################################################

	#prepare a data set
	raw_data=golf_fedex[golf_fedex$year!=2016,c('rk_fedex','drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')]
	raw_test=golf_fedex[golf_fedex$year==2016,c('drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')]
	raw_test_actual=golf_fedex[golf_fedex$year==2016,c('rk_fedex','drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')]
	aggr(raw_data) #Again, the lower-ranked golfers, so hard to impute, since they all fall outside of the data ranges.
	raw_data=raw_data[which(complete.cases(raw_data)),]
	raw_test_complete=raw_test[which(complete.cases(raw_test)),]
	aggr(raw_data)

	#fit multiple linear model on all data
	td_mod = lm(rk_fedex~.,raw_data)
	summary(td_mod)
	td_mod_summary = summary(td_mod)
	avPlots(td_mod)
	plot(td_mod) #Definitely some relationship, but it's not really "linear", maybe we can make it so with box-cox transform
	#First, let's build a model with reduced variable set and see how that looks.

	##########################MAKE MY OWN REDUCED MODEL################################
	td_mod_red = lm(rk_fedex~drd+dra+gir+ssv+pth+pthatg+pmd+ppr,raw_data)
	summary(td_mod_red)
	plot(td_mod_red)
	vif(td_mod_red)

	#Box Cox transform
	td_mod.bc=boxCox(td_mod)
	lambda = td_mod.bc$x[which(td_mod.bc$y == max(td_mod.bc$y))]
	raw_data$rk_fedex.bc = (raw_data$rk_fedex^lambda - 1)/lambda
	td_mod.bc=lm(rk_fedex.bc~.-rk_fedex,raw_data)
	summary(td_mod.bc)
	plot(td_mod.bc) #Looks like a reasonable model, let's try a reduced model based on significant variables

	#Test Box Cox transformed dependent variable on the reduced variable set
	td_mod_red.bc=lm(rk_fedex.bc~drd+dra+gir+ssv+pth+pthatg+pmd+ppr,raw_data)
	summary(td_mod_red.bc)
	plot(td_mod_red.bc)
	vif(td_mod_red.bc)
	avPlots(td_mod_red.bc)
	#Looks like a decent model, but let's see if there's a "best model"

	#Check full vs reduced
	AIC(td_mod_red.bc,td_mod.bc)
	BIC(td_mod_red.bc,td_mod.bc)
	#AIC and BIC don't show a clear advantage

	#Alternatively, let's do a stepwise regression and see what set of variables is identified
	model.empty=lm(rk_fedex.bc~1,raw_data)
	model.full=lm(rk_fedex.bc~.-rk_fedex,raw_data)
	scope=list(lower = formula(model.empty), upper = formula(model.full))

	forwardAIC = step(model.empty, scope, direction = "forward", k = 2)
	backwardAIC = step(model.full, scope, direction = "backward", k = 2)
	bothAIC.empty = step(model.empty, scope, direction = "both", k = 2)
	bothAIC.full = step(model.full, scope, direction = "both", k = 2)

	#Stepwise regression using BIC as the criteria (the penalty k = log(n)).
	forwardBIC = step(model.empty, scope, direction = "forward", k = log(196))
	backwardBIC = step(model.full, scope, direction = "backward", k = log(196))
	bothBIC.empty = step(model.empty, scope, direction = "both", k = log(196))
	bothBIC.full = step(model.full, scope, direction = "both", k = log(196))

	#BIC identifies a subset, similar to the SG model statistics but includes some overlapping areas, dra, pmd
	#after reviewing VIF, we can safely remove dra and pmd on concerns about collinearity. Resulting variable set follows:
	td_mod_red2.bc=lm(rk_fedex.bc~drd+gir+pthatg+ppr,raw_data)
	summary(td_mod_red2.bc)

	#Looks like resonably good model, but check assumptions
	plot(td_mod_red2.bc)
	vif(td_mod_red2.bc)
	avPlots(td_mod_red2.bc)

	###Which Model is best?
	AIC(td_mod_red2.bc,sg_sat_mod.bc)
	BIC(td_mod_red2.bc,sg_sat_mod.bc)

	#Test to see how well it predicts
	raw_mod.predicrat=data.frame(predict(td_mod_red2.bc,raw_test_complete,interval='confidence'))
	summary(raw_mod.predict)
	sg_test_actual$raw_mod_predict=raw_mod.predict$fit
	sg_test_actual$raw_fit_error=sg_test_actual$rk_fedex.bc-sg_test_actual$raw_mod_predict
	sg_test_actual$raw_error_sq = sg_test_actual$raw_fit_error**2

	#Compare the two models
	mean(sg_test_actual$raw_error_sq)
	sd(sg_test_actual$raw_error_sq)
	mean(sg_test_actual$error_sq)
	sd(sg_test_actual$error_sq)