Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created December 21, 2019 16:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/1e0680ce384ee0d166416bec6b06862d to your computer and use it in GitHub Desktop.
Save bayesball/1e0680ce384ee0d166416bec6b06862d to your computer and use it in GitHub Desktop.
Random effects model fitting of Retrosheet home run rates of batters and pitchers
# function fit_model() fits a non-nested random
# effects model to home run rates for a particular
# season
# input is a Retrosheet play-by-play dataset
# output is a list with four components
# - data frame containing all batter-pitcher matchups
# - graph of this data
# - random effect estimates from model fit
# - estimates at the standard deviations of batter and pitcher effects
fit_model <- function(d){
require(tidyverse)
require(lme4)
require(CalledStrike)
# only consider balls in play
d_ip <- filter(d, EVENT_CD %in% c(2, 18:23)) %>%
mutate(HR = ifelse(EVENT_CD == 23, 1, 0))
# find sample size and hr count for each combination
# of batter and pitcher
d_ip %>%
group_by(BAT_ID, PIT_ID) %>%
summarize(y = sum(HR), N = n()) -> S3
# construct plot of balls in play against hr
# rate, color point by hr count
S3$HR_Count <- as.character(S3$y)
p1 <- ggplot(S3, aes(N, y / N, color = HR_Count)) +
geom_jitter(width = 0.2, height = 0.03) +
ylim(-0.05, 1.05) +
increasefont() +
xlab("Balls in Play") +
ylab("HR Rate") +
centertitle() +
ggtitle("Balls in Play and HR Rates\nfor all Batter-Pitcher Matchups")
# only look at matchups with at least 4
# balls in play
S3_4 <- filter(S3, N >= 4)
# fit random effects model
cfit <- glmer(cbind(y, N - y)~
1 + (1 | BAT_ID) + (1 | PIT_ID),
data=S3_4,
family="binomial")
# output standard deviation estimates
# mean and random effects
vv <- VarCorr(cfit)
intercept <- fixef(cfit)
est <- ranef(cfit)
# save random effects in data frame
# put random effs on probability scale
newout1 <- data.frame(Type = "Batter",
Estimate = invlogit(intercept +
est$BAT_ID[, 1]))
newout2 <- data.frame(Type = "Pitcher",
Estimate = invlogit(intercept +
est$PIT_ID[, 1]))
# output
list(S = select(S3_4, BAT_ID, PIT_ID, y, N),
plot = p1,
estimates = rbind(newout1, newout2),
sd = c(attr(vv[[1]], "stddev"),
attr(vv[[2]], "stddev")))
}
# illustrate function for 2019 retrosheet data
fit_model(d2019) -> out_19
# here is the pitcher-batter matchup data
head(out_19$S)
# show some interesting matchups
out_19$S %>% arrange(desc(y)) %>% head()
# balls in play vs hr rate plot
out_19$plot
# estimates at sds
out_19$sd
# density estimates of the pitcher and batter
# random effects
ggplot(out_19$estimates,
aes(Estimate, color = Type)) +
geom_density()
# fit model using Retrosheet data for all
# seasons from 2000 through 2019
output <- data.frame(Season = 2000:2019,
SD_batter = 0,
SD_pitcher = 0)
k <- 0
for(season in 2000:2013){
k <- k + 1
out <- fit_model(filter(pbp.00.13, Season == season))
output[k, 2] <- out$sd[1]
output[k, 3] <- out$sd[2]
}
output[15, 2:3] <- fit_model(pbp.14)$sd
output[16, 2:3] <- fit_model(d2015)$sd
output[17, 2:3] <- fit_model(d2016)$sd
output[18, 2:3] <- fit_model(d2017)$sd
output[19, 2:3] <- fit_model(d2018)$sd
output[20, 2:3] <- fit_model(d2019)$sd
# plots of standard deviation estimates
# against season for last 20 seasons
ggplot(output, aes(Season, SD_batter)) +
geom_point(size = 2, color = "red") +
geom_smooth(se = FALSE, method = "loess") +
increasefont() +
ggtitle("Batter Standard Deviation: 2000-2019") +
centertitle()
ggplot(output, aes(Season, SD_pitcher)) +
geom_point(size = 2, color = "red") +
geom_smooth(se = FALSE, method = "loess")+
increasefont() +
ggtitle("Pitcher Standard Deviation: 2000-2019") +
centertitle()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment