Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active March 16, 2022 15:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/3e050600042aefa614a476cf03d5326f to your computer and use it in GitHub Desktop.
Save bayesball/3e050600042aefa614a476cf03d5326f to your computer and use it in GitHub Desktop.
Fit of a nonnested multilevel model to wOBA data for the 2021 baseball season
# load in packages
library(readr)
library(dplyr)
library(brms)
library(ggplot2)
library(posterior)
library(gridExtra)
library(Lahman)
# load 2021 data and Fangraphs data
load("~/Dropbox/Google Drive/Retrosheet/pbp.2021.Rdata")
fg_guts <-
read_csv("https://raw.githubusercontent.com/bayesball/HomeRuns2021/main/fg_guts.csv")
# only consider batting events and collect
# key variables
d2021 %>%
filter(BAT_EVENT_FL == TRUE) %>%
select(PIT_ID, BAT_ID, EVENT_CD) %>%
mutate(Season = 2021) -> d2021a
# compute wOBA weights
d2021a %>%
inner_join(fg_guts, by = "Season") %>%
mutate(WT = wBB * (EVENT_CD %in% 14:15) +
wHBP * (EVENT_CD == 16) +
w1B * (EVENT_CD == 20) +
w2B * (EVENT_CD == 21) +
w3B * (EVENT_CD == 22) +
wHR * (EVENT_CD == 23)) %>%
select(PIT_ID, BAT_ID, EVENT_CD, WT) -> d2021a
# brms fit
fitB <- brm(WT ~ 1 + (1 | PIT_ID) + (1 | BAT_ID),
data = d2021a)
# collect draws and summarize
par_sim <- as_draws(fitB)
S <- summarize_draws(par_sim)
# plot density estimates of standard deviations
sim_PIT <- unlist(subset_draws(par_sim,
variable = "sd_PIT_ID__Intercept"))
sim_BAT <- unlist(subset_draws(par_sim,
variable = "sd_BAT_ID__Intercept"))
df <- rbind(data.frame(SD = sim_PIT, Type = "Pitcher"),
data.frame(SD = sim_BAT, Type = "Batter"))
ggplot(df, aes(SD, color = Type)) +
geom_density() +
centertitle() +
increasefont() +
ggtitle("Posterior Densities of
RE Standard Deviations")
# collect mean, median and sd of
# b_intercept, sd_BAT_ID, sd_PIT_ID, sigma
S1 <- S[1:4, 1:4] # means of main variables
# work with random effects
# find indices of re's for batters and for pitchers
b_ind <- grep("r\\_BAT", S$variable)
p_ind <- grep("r\\_PIT", S$variable)
# create data frames of random effect estimates
S_batter <- S[b_ind, c("variable", "mean")]
S_batter %>%
mutate(Var = gsub("r_BAT_ID\\[", "", variable),
Var = gsub(",Intercept\\]", "", Var)) %>%
select(Var, mean) -> S_batter
S_pitcher <- S[p_ind, c("variable", "mean")]
S_pitcher %>%
mutate(Var = gsub("r_PIT_ID\\[", "", variable),
Var = gsub(",Intercept\\]", "", Var)) %>%
select(Var, mean) -> S_pitcher
# collect PA and raw wOBA estimates
d2021a %>%
group_by(PIT_ID) %>%
summarize(PA = n(),
Estimate = mean(WT)) ->
pitcher_estimates
d2021a %>%
group_by(BAT_ID) %>%
summarize(PA = n(),
Estimate = mean(WT)) ->
batter_estimates
# merge raw and RE estimate tables
inner_join(batter_estimates, S_batter,
by = c("BAT_ID" = "Var")) ->
batter_estimates
inner_join(pitcher_estimates, S_pitcher,
by = c("PIT_ID" = "Var")) ->
pitcher_estimates
# some graphs -- first a couple of utility functions
centertitle <- function(){
theme(plot.title = element_text(colour = "blue",
size = 18,
hjust = 0.5,
vjust = 0.8,
angle = 0))
}
increasefont <-function(){
theme(text=element_text(size=18))
}
# scatterplot of raw wOBA estimates and random effects
ggplot(pitcher_estimates,
aes(Estimate, mean, color = PA)) +
geom_point() +
increasefont()
# plot of raw wOBA and random effects against PA
p1 <- ggplot(pitcher_estimates, aes(PA, Estimate)) +
geom_point() +
increasefont() +
centertitle() +
ggtitle("Raw Pitcher Estimates")
p2 <- ggplot(pitcher_estimates, aes(PA, mean)) +
geom_point() +
geom_smooth(color = "blue") +
geom_hline(yintercept = 0, color = "red") +
increasefont() +
centertitle() +
ggtitle("Pitcher Random Effects") +
ylab("Random Effect")
grid.arrange(p1, p2)
# do this for batters
q1 <- ggplot(batter_estimates, aes(PA, Estimate)) +
geom_point() +
increasefont() +
centertitle() +
ggtitle("Raw Batter Estimates")
q2 <- ggplot(batter_estimates, aes(PA, mean)) +
geom_point() +
geom_smooth(color = "blue") +
geom_hline(yintercept = 0, color = "red") +
increasefont() +
centertitle() +
ggtitle("Batter Random Effects") +
ylab("Random Effect")
grid.arrange(q1, q2)
# best pitchers and hitters
# first add names
inner_join(pitcher_estimates,
select(Master, retroID,
nameFirst, nameLast),
by = c("PIT_ID" = "retroID")) %>%
mutate(Name = paste(nameFirst, nameLast)) %>%
select(Name, PA, Estimate, mean) ->
pitcher_estimates
inner_join(batter_estimates,
select(Master, retroID,
nameFirst, nameLast),
by = c("BAT_ID" = "retroID")) %>%
mutate(Name = paste(nameFirst, nameLast)) %>%
select(Name, PA, Estimate, mean) ->
batter_estimates
# create top 10 lists
pitcher_estimates %>%
arrange(mean) %>%
head(10)
batter_estimates %>%
arrange(desc(mean)) %>%
head(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment