Last active
March 16, 2022 15:03
-
-
Save bayesball/3e050600042aefa614a476cf03d5326f to your computer and use it in GitHub Desktop.
Fit of a nonnested multilevel model to wOBA data for the 2021 baseball season
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load in packages | |
library(readr) | |
library(dplyr) | |
library(brms) | |
library(ggplot2) | |
library(posterior) | |
library(gridExtra) | |
library(Lahman) | |
# load 2021 data and Fangraphs data | |
load("~/Dropbox/Google Drive/Retrosheet/pbp.2021.Rdata") | |
fg_guts <- | |
read_csv("https://raw.githubusercontent.com/bayesball/HomeRuns2021/main/fg_guts.csv") | |
# only consider batting events and collect | |
# key variables | |
d2021 %>% | |
filter(BAT_EVENT_FL == TRUE) %>% | |
select(PIT_ID, BAT_ID, EVENT_CD) %>% | |
mutate(Season = 2021) -> d2021a | |
# compute wOBA weights | |
d2021a %>% | |
inner_join(fg_guts, by = "Season") %>% | |
mutate(WT = wBB * (EVENT_CD %in% 14:15) + | |
wHBP * (EVENT_CD == 16) + | |
w1B * (EVENT_CD == 20) + | |
w2B * (EVENT_CD == 21) + | |
w3B * (EVENT_CD == 22) + | |
wHR * (EVENT_CD == 23)) %>% | |
select(PIT_ID, BAT_ID, EVENT_CD, WT) -> d2021a | |
# brms fit | |
fitB <- brm(WT ~ 1 + (1 | PIT_ID) + (1 | BAT_ID), | |
data = d2021a) | |
# collect draws and summarize | |
par_sim <- as_draws(fitB) | |
S <- summarize_draws(par_sim) | |
# plot density estimates of standard deviations | |
sim_PIT <- unlist(subset_draws(par_sim, | |
variable = "sd_PIT_ID__Intercept")) | |
sim_BAT <- unlist(subset_draws(par_sim, | |
variable = "sd_BAT_ID__Intercept")) | |
df <- rbind(data.frame(SD = sim_PIT, Type = "Pitcher"), | |
data.frame(SD = sim_BAT, Type = "Batter")) | |
ggplot(df, aes(SD, color = Type)) + | |
geom_density() + | |
centertitle() + | |
increasefont() + | |
ggtitle("Posterior Densities of | |
RE Standard Deviations") | |
# collect mean, median and sd of | |
# b_intercept, sd_BAT_ID, sd_PIT_ID, sigma | |
S1 <- S[1:4, 1:4] # means of main variables | |
# work with random effects | |
# find indices of re's for batters and for pitchers | |
b_ind <- grep("r\\_BAT", S$variable) | |
p_ind <- grep("r\\_PIT", S$variable) | |
# create data frames of random effect estimates | |
S_batter <- S[b_ind, c("variable", "mean")] | |
S_batter %>% | |
mutate(Var = gsub("r_BAT_ID\\[", "", variable), | |
Var = gsub(",Intercept\\]", "", Var)) %>% | |
select(Var, mean) -> S_batter | |
S_pitcher <- S[p_ind, c("variable", "mean")] | |
S_pitcher %>% | |
mutate(Var = gsub("r_PIT_ID\\[", "", variable), | |
Var = gsub(",Intercept\\]", "", Var)) %>% | |
select(Var, mean) -> S_pitcher | |
# collect PA and raw wOBA estimates | |
d2021a %>% | |
group_by(PIT_ID) %>% | |
summarize(PA = n(), | |
Estimate = mean(WT)) -> | |
pitcher_estimates | |
d2021a %>% | |
group_by(BAT_ID) %>% | |
summarize(PA = n(), | |
Estimate = mean(WT)) -> | |
batter_estimates | |
# merge raw and RE estimate tables | |
inner_join(batter_estimates, S_batter, | |
by = c("BAT_ID" = "Var")) -> | |
batter_estimates | |
inner_join(pitcher_estimates, S_pitcher, | |
by = c("PIT_ID" = "Var")) -> | |
pitcher_estimates | |
# some graphs -- first a couple of utility functions | |
centertitle <- function(){ | |
theme(plot.title = element_text(colour = "blue", | |
size = 18, | |
hjust = 0.5, | |
vjust = 0.8, | |
angle = 0)) | |
} | |
increasefont <-function(){ | |
theme(text=element_text(size=18)) | |
} | |
# scatterplot of raw wOBA estimates and random effects | |
ggplot(pitcher_estimates, | |
aes(Estimate, mean, color = PA)) + | |
geom_point() + | |
increasefont() | |
# plot of raw wOBA and random effects against PA | |
p1 <- ggplot(pitcher_estimates, aes(PA, Estimate)) + | |
geom_point() + | |
increasefont() + | |
centertitle() + | |
ggtitle("Raw Pitcher Estimates") | |
p2 <- ggplot(pitcher_estimates, aes(PA, mean)) + | |
geom_point() + | |
geom_smooth(color = "blue") + | |
geom_hline(yintercept = 0, color = "red") + | |
increasefont() + | |
centertitle() + | |
ggtitle("Pitcher Random Effects") + | |
ylab("Random Effect") | |
grid.arrange(p1, p2) | |
# do this for batters | |
q1 <- ggplot(batter_estimates, aes(PA, Estimate)) + | |
geom_point() + | |
increasefont() + | |
centertitle() + | |
ggtitle("Raw Batter Estimates") | |
q2 <- ggplot(batter_estimates, aes(PA, mean)) + | |
geom_point() + | |
geom_smooth(color = "blue") + | |
geom_hline(yintercept = 0, color = "red") + | |
increasefont() + | |
centertitle() + | |
ggtitle("Batter Random Effects") + | |
ylab("Random Effect") | |
grid.arrange(q1, q2) | |
# best pitchers and hitters | |
# first add names | |
inner_join(pitcher_estimates, | |
select(Master, retroID, | |
nameFirst, nameLast), | |
by = c("PIT_ID" = "retroID")) %>% | |
mutate(Name = paste(nameFirst, nameLast)) %>% | |
select(Name, PA, Estimate, mean) -> | |
pitcher_estimates | |
inner_join(batter_estimates, | |
select(Master, retroID, | |
nameFirst, nameLast), | |
by = c("BAT_ID" = "retroID")) %>% | |
mutate(Name = paste(nameFirst, nameLast)) %>% | |
select(Name, PA, Estimate, mean) -> | |
batter_estimates | |
# create top 10 lists | |
pitcher_estimates %>% | |
arrange(mean) %>% | |
head(10) | |
batter_estimates %>% | |
arrange(desc(mean)) %>% | |
head(10) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment