Last active
February 20, 2016 00:14
-
-
Save bayesball/15557fa4648ed4755500 to your computer and use it in GitHub Desktop.
Modeling using Efron and Morris's famous dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R script for | |
# "Revisiting Efron and Morris's Data" | |
# blog post of February 15, 2016 | |
# in current working directory need a download folder with two subfolders | |
# zipped and unzipped | |
# (For a Windows computer, you need to have the Chadwick cwevent.exe | |
# inside the “upzipped” folder.) | |
library(devtools) | |
library(ggplot2) | |
library(dplyr) | |
library(LearnBayes) | |
library(Lahman) | |
# read in parse.retrosheet2.pbp function | |
# read in data from 1970 season | |
source_gist("8892981") | |
parse.retrosheet2.pbp(1970) | |
# navigate to download.folder/unzipped | |
d <- read.csv("download.folder/unzipped/all1970.csv", | |
header=FALSE) | |
fields <- read.csv("http://personal.bgsu.edu/~albert/baseball/fields.csv") | |
names(d) <- fields$Header | |
# get hit data for all players in the month of April | |
# collect hit data through April 24 and restrict to batters who had | |
# between 44 and 46 AB | |
april24 <- filter(d, substr(GAME_ID, 9, 9) == "4", | |
as.numeric(substr(GAME_ID, 10, 11)) <= 24) | |
hit_data <- summarize(group_by(april24, BAT_ID), | |
AB=sum(AB_FL), H=sum(H_FL > 0)) | |
efron.morris <- filter(hit_data, AB >= 44, AB <= 46) | |
# merge with first and last names from Master data frame from Lahman package | |
efron.morris <- inner_join(select(Master, retroID, nameFirst, nameLast), | |
efron.morris, by=c("retroID"="BAT_ID")) | |
# fit beta binomial model | |
# read in fit.model function (uses LearnBayes function) | |
source_gist("8f51899930192776bfd8") | |
fit <- fit.model(data.frame(y=efron.morris$H, n=efron.morris$AB)) | |
fit$eta | |
fit$K | |
fit$d | |
# plot estimates and observed values | |
efron.morris$Estimate <- fit$d$est | |
efron.morris$Player <- with(efron.morris, | |
factor(paste(nameFirst, nameLast))) | |
df <- rbind(data.frame(Type="Observed", | |
AVG=with(efron.morris, H / AB), | |
Player=efron.morris$Player), | |
data.frame(Type="Estimated", | |
AVG=efron.morris$Estimate, | |
Player=efron.morris$Player)) | |
p <- ggplot(data=df, aes(x=Type, y=AVG, | |
color=Player)) + | |
geom_line(aes(group=Player)) + | |
ggtitle("Shrinkage of Batting Averages") | |
print(p) | |
# prediction | |
# collect number of season AB for the 1970 season and use this to | |
# compute the number of future AB | |
season_data <- summarize( | |
group_by(filter(d, BAT_ID %in% efron.morris$retroID), BAT_ID), | |
AB=sum(AB_FL), H=sum(H_FL > 0)) | |
mutate(inner_join(efron.morris, season_data, c("retroID"="BAT_ID")), | |
AB.future=AB.y - AB.x) -> efron.morris | |
# simulate predictive distribution for a specific player | |
predict.avg <- function(player, n.sim=1000){ | |
pd <- filter(efron.morris, retroID==player) | |
beta.par <- c(fit$K * fit$eta + pd$H.x, | |
fit$K * (1 - fit$eta) + pd$AB.x - pd$H.x) | |
p <- rbeta(n.sim, beta.par[1], beta.par[2]) | |
H.future <- rbinom(n.sim, size=pd$AB.future, prob=p) | |
data.frame(BAT_ID=player, AVG=H.future / pd$AB.future) | |
} | |
# collect predictions for all players | |
predictions <- do.call("rbind", | |
lapply(efron.morris$retroID, predict.avg)) | |
predictions <- inner_join(predictions, | |
select(efron.morris, retroID, nameFirst, nameLast), | |
by=c("BAT_ID"="retroID")) | |
predictions <- mutate(predictions, | |
Name=factor(paste(nameFirst, nameLast))) | |
# graph density estimates for all predictive simulations | |
p <- ggplot(predictions, aes(AVG)) + geom_density() + | |
facet_wrap(~ Name, ncol=3) + | |
ggtitle("Prediction of Future AVG for Batters in 1970 Season") | |
print(p) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment