bayesball/efron_morris.R

## efron_morris.R
# R script for
# "Revisiting Efron and Morris's Data"
# blog post of February 15, 2016

# in current working directory need a download folder with two subfolders
# zipped and unzipped
# (For a Windows computer, you need to have the Chadwick cwevent.exe
# inside the “upzipped” folder.)

library(devtools)
library(ggplot2)
library(dplyr)
library(LearnBayes)
library(Lahman)

# read in parse.retrosheet2.pbp function
# read in data from 1970 season

source_gist("8892981")
parse.retrosheet2.pbp(1970)

# navigate to download.folder/unzipped

d <- read.csv("download.folder/unzipped/all1970.csv",
              header=FALSE)
fields <- read.csv("http://personal.bgsu.edu/~albert/baseball/fields.csv")
names(d) <- fields$Header

# get hit data for all players in the month of April
# collect hit data through April 24 and restrict to batters who had
# between 44 and 46 AB

april24 <- filter(d, substr(GAME_ID, 9, 9) == "4",
              as.numeric(substr(GAME_ID, 10, 11)) <= 24)

hit_data <- summarize(group_by(april24, BAT_ID),
                      AB=sum(AB_FL), H=sum(H_FL > 0))

efron.morris <- filter(hit_data, AB >= 44, AB <= 46)

# merge with first and last names from Master data frame from Lahman package

efron.morris <- inner_join(select(Master, retroID, nameFirst, nameLast),
           efron.morris, by=c("retroID"="BAT_ID"))

# fit beta binomial model
# read in fit.model function (uses LearnBayes function)

source_gist("8f51899930192776bfd8")
fit <- fit.model(data.frame(y=efron.morris$H, n=efron.morris$AB))

fit$eta
fit$K
fit$d

# plot estimates and observed values

efron.morris$Estimate <- fit$d$est
efron.morris$Player <- with(efron.morris,
                            factor(paste(nameFirst, nameLast)))
df <- rbind(data.frame(Type="Observed",
                       AVG=with(efron.morris, H / AB),
                       Player=efron.morris$Player),
            data.frame(Type="Estimated",
                       AVG=efron.morris$Estimate,
                       Player=efron.morris$Player))

p <- ggplot(data=df, aes(x=Type, y=AVG,
                    color=Player)) +
  geom_line(aes(group=Player)) +
  ggtitle("Shrinkage of Batting Averages")
print(p)

# prediction

# collect number of season AB for the 1970 season and use this to
# compute the number of future AB

season_data <- summarize(
      group_by(filter(d, BAT_ID %in% efron.morris$retroID), BAT_ID),
                  AB=sum(AB_FL), H=sum(H_FL > 0))

mutate(inner_join(efron.morris, season_data, c("retroID"="BAT_ID")),
         AB.future=AB.y - AB.x) -> efron.morris

# simulate predictive distribution for a specific player

predict.avg <- function(player, n.sim=1000){
  pd <- filter(efron.morris, retroID==player)
  beta.par <- c(fit$K * fit$eta + pd$H.x,
                fit$K * (1 - fit$eta) + pd$AB.x - pd$H.x)
  p <- rbeta(n.sim, beta.par[1], beta.par[2])
  H.future <- rbinom(n.sim, size=pd$AB.future, prob=p)
  data.frame(BAT_ID=player, AVG=H.future / pd$AB.future)
}

# collect predictions for all players

predictions <- do.call("rbind",
              lapply(efron.morris$retroID, predict.avg))
predictions <- inner_join(predictions,
                 select(efron.morris, retroID, nameFirst, nameLast),
                 by=c("BAT_ID"="retroID"))
predictions <- mutate(predictions,
                      Name=factor(paste(nameFirst, nameLast)))

# graph density estimates for all predictive simulations

p <- ggplot(predictions, aes(AVG)) + geom_density() +
 facet_wrap(~ Name, ncol=3) +
 ggtitle("Prediction of Future AVG for Batters in 1970 Season")
print(p)
	# R script for
	# "Revisiting Efron and Morris's Data"
	# blog post of February 15, 2016

	# in current working directory need a download folder with two subfolders
	# zipped and unzipped
	# (For a Windows computer, you need to have the Chadwick cwevent.exe
	# inside the “upzipped” folder.)

	library(devtools)
	library(ggplot2)
	library(dplyr)
	library(LearnBayes)
	library(Lahman)

	# read in parse.retrosheet2.pbp function
	# read in data from 1970 season

	source_gist("8892981")
	parse.retrosheet2.pbp(1970)

	# navigate to download.folder/unzipped

	d <- read.csv("download.folder/unzipped/all1970.csv",
	header=FALSE)
	fields <- read.csv("http://personal.bgsu.edu/~albert/baseball/fields.csv")
	names(d) <- fields$Header

	# get hit data for all players in the month of April
	# collect hit data through April 24 and restrict to batters who had
	# between 44 and 46 AB

	april24 <- filter(d, substr(GAME_ID, 9, 9) == "4",
	as.numeric(substr(GAME_ID, 10, 11)) <= 24)

	hit_data <- summarize(group_by(april24, BAT_ID),
	AB=sum(AB_FL), H=sum(H_FL > 0))

	efron.morris <- filter(hit_data, AB >= 44, AB <= 46)

	# merge with first and last names from Master data frame from Lahman package

	efron.morris <- inner_join(select(Master, retroID, nameFirst, nameLast),
	efron.morris, by=c("retroID"="BAT_ID"))

	# fit beta binomial model
	# read in fit.model function (uses LearnBayes function)

	source_gist("8f51899930192776bfd8")
	fit <- fit.model(data.frame(y=efron.morris$H, n=efron.morris$AB))

	fit$eta
	fit$K
	fit$d

	# plot estimates and observed values

	efron.morris$Estimate <- fit$d$est
	efron.morris$Player <- with(efron.morris,
	factor(paste(nameFirst, nameLast)))
	df <- rbind(data.frame(Type="Observed",
	AVG=with(efron.morris, H / AB),
	Player=efron.morris$Player),
	data.frame(Type="Estimated",
	AVG=efron.morris$Estimate,
	Player=efron.morris$Player))

	p <- ggplot(data=df, aes(x=Type, y=AVG,
	color=Player)) +
	geom_line(aes(group=Player)) +
	ggtitle("Shrinkage of Batting Averages")
	print(p)

	# prediction

	# collect number of season AB for the 1970 season and use this to
	# compute the number of future AB

	season_data <- summarize(
	group_by(filter(d, BAT_ID %in% efron.morris$retroID), BAT_ID),
	AB=sum(AB_FL), H=sum(H_FL > 0))

	mutate(inner_join(efron.morris, season_data, c("retroID"="BAT_ID")),
	AB.future=AB.y - AB.x) -> efron.morris

	# simulate predictive distribution for a specific player

	predict.avg <- function(player, n.sim=1000){
	pd <- filter(efron.morris, retroID==player)
	beta.par <- c(fit$K * fit$eta + pd$H.x,
	fit$K * (1 - fit$eta) + pd$AB.x - pd$H.x)
	p <- rbeta(n.sim, beta.par[1], beta.par[2])
	H.future <- rbinom(n.sim, size=pd$AB.future, prob=p)
	data.frame(BAT_ID=player, AVG=H.future / pd$AB.future)
	}

	# collect predictions for all players

	predictions <- do.call("rbind",
	lapply(efron.morris$retroID, predict.avg))
	predictions <- inner_join(predictions,
	select(efron.morris, retroID, nameFirst, nameLast),
	by=c("BAT_ID"="retroID"))
	predictions <- mutate(predictions,
	Name=factor(paste(nameFirst, nameLast)))

	# graph density estimates for all predictive simulations

	p <- ggplot(predictions, aes(AVG)) + geom_density() +
	facet_wrap(~ Name, ncol=3) +
	ggtitle("Prediction of Future AVG for Batters in 1970 Season")
	print(p)