Created
December 21, 2019 16:17
-
-
Save bayesball/1e0680ce384ee0d166416bec6b06862d to your computer and use it in GitHub Desktop.
Random effects model fitting of Retrosheet home run rates of batters and pitchers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function fit_model() fits a non-nested random | |
# effects model to home run rates for a particular | |
# season | |
# input is a Retrosheet play-by-play dataset | |
# output is a list with four components | |
# - data frame containing all batter-pitcher matchups | |
# - graph of this data | |
# - random effect estimates from model fit | |
# - estimates at the standard deviations of batter and pitcher effects | |
fit_model <- function(d){ | |
require(tidyverse) | |
require(lme4) | |
require(CalledStrike) | |
# only consider balls in play | |
d_ip <- filter(d, EVENT_CD %in% c(2, 18:23)) %>% | |
mutate(HR = ifelse(EVENT_CD == 23, 1, 0)) | |
# find sample size and hr count for each combination | |
# of batter and pitcher | |
d_ip %>% | |
group_by(BAT_ID, PIT_ID) %>% | |
summarize(y = sum(HR), N = n()) -> S3 | |
# construct plot of balls in play against hr | |
# rate, color point by hr count | |
S3$HR_Count <- as.character(S3$y) | |
p1 <- ggplot(S3, aes(N, y / N, color = HR_Count)) + | |
geom_jitter(width = 0.2, height = 0.03) + | |
ylim(-0.05, 1.05) + | |
increasefont() + | |
xlab("Balls in Play") + | |
ylab("HR Rate") + | |
centertitle() + | |
ggtitle("Balls in Play and HR Rates\nfor all Batter-Pitcher Matchups") | |
# only look at matchups with at least 4 | |
# balls in play | |
S3_4 <- filter(S3, N >= 4) | |
# fit random effects model | |
cfit <- glmer(cbind(y, N - y)~ | |
1 + (1 | BAT_ID) + (1 | PIT_ID), | |
data=S3_4, | |
family="binomial") | |
# output standard deviation estimates | |
# mean and random effects | |
vv <- VarCorr(cfit) | |
intercept <- fixef(cfit) | |
est <- ranef(cfit) | |
# save random effects in data frame | |
# put random effs on probability scale | |
newout1 <- data.frame(Type = "Batter", | |
Estimate = invlogit(intercept + | |
est$BAT_ID[, 1])) | |
newout2 <- data.frame(Type = "Pitcher", | |
Estimate = invlogit(intercept + | |
est$PIT_ID[, 1])) | |
# output | |
list(S = select(S3_4, BAT_ID, PIT_ID, y, N), | |
plot = p1, | |
estimates = rbind(newout1, newout2), | |
sd = c(attr(vv[[1]], "stddev"), | |
attr(vv[[2]], "stddev"))) | |
} | |
# illustrate function for 2019 retrosheet data | |
fit_model(d2019) -> out_19 | |
# here is the pitcher-batter matchup data | |
head(out_19$S) | |
# show some interesting matchups | |
out_19$S %>% arrange(desc(y)) %>% head() | |
# balls in play vs hr rate plot | |
out_19$plot | |
# estimates at sds | |
out_19$sd | |
# density estimates of the pitcher and batter | |
# random effects | |
ggplot(out_19$estimates, | |
aes(Estimate, color = Type)) + | |
geom_density() | |
# fit model using Retrosheet data for all | |
# seasons from 2000 through 2019 | |
output <- data.frame(Season = 2000:2019, | |
SD_batter = 0, | |
SD_pitcher = 0) | |
k <- 0 | |
for(season in 2000:2013){ | |
k <- k + 1 | |
out <- fit_model(filter(pbp.00.13, Season == season)) | |
output[k, 2] <- out$sd[1] | |
output[k, 3] <- out$sd[2] | |
} | |
output[15, 2:3] <- fit_model(pbp.14)$sd | |
output[16, 2:3] <- fit_model(d2015)$sd | |
output[17, 2:3] <- fit_model(d2016)$sd | |
output[18, 2:3] <- fit_model(d2017)$sd | |
output[19, 2:3] <- fit_model(d2018)$sd | |
output[20, 2:3] <- fit_model(d2019)$sd | |
# plots of standard deviation estimates | |
# against season for last 20 seasons | |
ggplot(output, aes(Season, SD_batter)) + | |
geom_point(size = 2, color = "red") + | |
geom_smooth(se = FALSE, method = "loess") + | |
increasefont() + | |
ggtitle("Batter Standard Deviation: 2000-2019") + | |
centertitle() | |
ggplot(output, aes(Season, SD_pitcher)) + | |
geom_point(size = 2, color = "red") + | |
geom_smooth(se = FALSE, method = "loess")+ | |
increasefont() + | |
ggtitle("Pitcher Standard Deviation: 2000-2019") + | |
centertitle() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment