Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active February 20, 2016 00:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save bayesball/15557fa4648ed4755500 to your computer and use it in GitHub Desktop.
Save bayesball/15557fa4648ed4755500 to your computer and use it in GitHub Desktop.
Modeling using Efron and Morris's famous dataset
# R script for
# "Revisiting Efron and Morris's Data"
# blog post of February 15, 2016
# in current working directory need a download folder with two subfolders
# zipped and unzipped
# (For a Windows computer, you need to have the Chadwick cwevent.exe
# inside the “upzipped” folder.)
library(devtools)
library(ggplot2)
library(dplyr)
library(LearnBayes)
library(Lahman)
# read in parse.retrosheet2.pbp function
# read in data from 1970 season
source_gist("8892981")
parse.retrosheet2.pbp(1970)
# navigate to download.folder/unzipped
d <- read.csv("download.folder/unzipped/all1970.csv",
header=FALSE)
fields <- read.csv("http://personal.bgsu.edu/~albert/baseball/fields.csv")
names(d) <- fields$Header
# get hit data for all players in the month of April
# collect hit data through April 24 and restrict to batters who had
# between 44 and 46 AB
april24 <- filter(d, substr(GAME_ID, 9, 9) == "4",
as.numeric(substr(GAME_ID, 10, 11)) <= 24)
hit_data <- summarize(group_by(april24, BAT_ID),
AB=sum(AB_FL), H=sum(H_FL > 0))
efron.morris <- filter(hit_data, AB >= 44, AB <= 46)
# merge with first and last names from Master data frame from Lahman package
efron.morris <- inner_join(select(Master, retroID, nameFirst, nameLast),
efron.morris, by=c("retroID"="BAT_ID"))
# fit beta binomial model
# read in fit.model function (uses LearnBayes function)
source_gist("8f51899930192776bfd8")
fit <- fit.model(data.frame(y=efron.morris$H, n=efron.morris$AB))
fit$eta
fit$K
fit$d
# plot estimates and observed values
efron.morris$Estimate <- fit$d$est
efron.morris$Player <- with(efron.morris,
factor(paste(nameFirst, nameLast)))
df <- rbind(data.frame(Type="Observed",
AVG=with(efron.morris, H / AB),
Player=efron.morris$Player),
data.frame(Type="Estimated",
AVG=efron.morris$Estimate,
Player=efron.morris$Player))
p <- ggplot(data=df, aes(x=Type, y=AVG,
color=Player)) +
geom_line(aes(group=Player)) +
ggtitle("Shrinkage of Batting Averages")
print(p)
# prediction
# collect number of season AB for the 1970 season and use this to
# compute the number of future AB
season_data <- summarize(
group_by(filter(d, BAT_ID %in% efron.morris$retroID), BAT_ID),
AB=sum(AB_FL), H=sum(H_FL > 0))
mutate(inner_join(efron.morris, season_data, c("retroID"="BAT_ID")),
AB.future=AB.y - AB.x) -> efron.morris
# simulate predictive distribution for a specific player
predict.avg <- function(player, n.sim=1000){
pd <- filter(efron.morris, retroID==player)
beta.par <- c(fit$K * fit$eta + pd$H.x,
fit$K * (1 - fit$eta) + pd$AB.x - pd$H.x)
p <- rbeta(n.sim, beta.par[1], beta.par[2])
H.future <- rbinom(n.sim, size=pd$AB.future, prob=p)
data.frame(BAT_ID=player, AVG=H.future / pd$AB.future)
}
# collect predictions for all players
predictions <- do.call("rbind",
lapply(efron.morris$retroID, predict.avg))
predictions <- inner_join(predictions,
select(efron.morris, retroID, nameFirst, nameLast),
by=c("BAT_ID"="retroID"))
predictions <- mutate(predictions,
Name=factor(paste(nameFirst, nameLast)))
# graph density estimates for all predictive simulations
p <- ggplot(predictions, aes(AVG)) + geom_density() +
facet_wrap(~ Name, ncol=3) +
ggtitle("Prediction of Future AVG for Batters in 1970 Season")
print(p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment