bayesball/nonnested_fit.R

## nonnested_fit.R
# load in packages

library(readr)
library(dplyr)
library(brms)
library(ggplot2)
library(posterior)
library(gridExtra)
library(Lahman)

# load 2021 data and Fangraphs data

load("~/Dropbox/Google Drive/Retrosheet/pbp.2021.Rdata")
fg_guts <-
  read_csv("https://raw.githubusercontent.com/bayesball/HomeRuns2021/main/fg_guts.csv")

# only consider batting events and collect
# key variables

d2021 %>%
  filter(BAT_EVENT_FL == TRUE) %>%
  select(PIT_ID, BAT_ID, EVENT_CD) %>%
  mutate(Season = 2021) -> d2021a

# compute wOBA weights

d2021a %>%
  inner_join(fg_guts, by = "Season") %>%
  mutate(WT = wBB * (EVENT_CD %in% 14:15) +
           wHBP * (EVENT_CD == 16) +
           w1B * (EVENT_CD == 20) +
           w2B * (EVENT_CD == 21) +
           w3B * (EVENT_CD == 22) +
           wHR * (EVENT_CD == 23)) %>%
  select(PIT_ID, BAT_ID, EVENT_CD, WT) -> d2021a

# brms fit

fitB <- brm(WT ~ 1 + (1 | PIT_ID) + (1 | BAT_ID),
            data = d2021a)

# collect draws and summarize

par_sim <- as_draws(fitB)
S <- summarize_draws(par_sim)

# plot density estimates of standard deviations

sim_PIT <- unlist(subset_draws(par_sim,
                  variable = "sd_PIT_ID__Intercept"))
sim_BAT <- unlist(subset_draws(par_sim,
                  variable = "sd_BAT_ID__Intercept"))

df <- rbind(data.frame(SD = sim_PIT, Type = "Pitcher"),
            data.frame(SD = sim_BAT, Type = "Batter"))

ggplot(df, aes(SD, color = Type)) +
  geom_density() +
  centertitle() +
  increasefont() +
  ggtitle("Posterior Densities of
  RE Standard Deviations")

# collect mean, median and sd of
# b_intercept, sd_BAT_ID, sd_PIT_ID, sigma

S1 <- S[1:4, 1:4] # means of main variables

# work with random effects

# find indices of re's for batters and for pitchers
b_ind <- grep("r\\_BAT", S$variable)
p_ind <- grep("r\\_PIT", S$variable)

# create data frames of random effect estimates
S_batter <- S[b_ind, c("variable", "mean")]
S_batter %>%
  mutate(Var = gsub("r_BAT_ID\\[", "", variable),
         Var = gsub(",Intercept\\]", "", Var)) %>%
  select(Var, mean) -> S_batter

S_pitcher <- S[p_ind, c("variable", "mean")]
S_pitcher %>%
  mutate(Var = gsub("r_PIT_ID\\[", "", variable),
         Var = gsub(",Intercept\\]", "", Var)) %>%
  select(Var, mean) -> S_pitcher

# collect PA and raw wOBA estimates
d2021a %>%
  group_by(PIT_ID) %>%
  summarize(PA = n(),
            Estimate = mean(WT)) ->
  pitcher_estimates

d2021a %>%
  group_by(BAT_ID) %>%
  summarize(PA = n(),
            Estimate = mean(WT)) ->
  batter_estimates

# merge raw and RE estimate tables

inner_join(batter_estimates, S_batter,
           by = c("BAT_ID" = "Var")) ->
  batter_estimates

inner_join(pitcher_estimates, S_pitcher,
           by = c("PIT_ID" = "Var")) ->
  pitcher_estimates

# some graphs -- first a couple of utility functions

centertitle <- function(){
  theme(plot.title = element_text(colour = "blue",
                                  size = 18,
                                  hjust = 0.5,
                                  vjust = 0.8,
                                  angle = 0))
}
increasefont <-function(){
  theme(text=element_text(size=18))
}

# scatterplot of raw wOBA estimates and random effects
ggplot(pitcher_estimates,
       aes(Estimate, mean, color = PA)) +
  geom_point() +
  increasefont()

# plot of raw wOBA and random effects against PA
p1 <- ggplot(pitcher_estimates, aes(PA, Estimate)) +
  geom_point() +
  increasefont() +
  centertitle() +
  ggtitle("Raw Pitcher Estimates")

p2 <- ggplot(pitcher_estimates, aes(PA, mean)) +
  geom_point() +
  geom_smooth(color = "blue") +
  geom_hline(yintercept = 0, color = "red") +
  increasefont() +
  centertitle() +
  ggtitle("Pitcher Random Effects") +
  ylab("Random Effect")

grid.arrange(p1, p2)

# do this for batters

q1 <- ggplot(batter_estimates, aes(PA, Estimate)) +
  geom_point() +
  increasefont() +
  centertitle() +
  ggtitle("Raw Batter Estimates")

q2 <- ggplot(batter_estimates, aes(PA, mean)) +
  geom_point() +
  geom_smooth(color = "blue") +
  geom_hline(yintercept = 0, color = "red") +
  increasefont() +
  centertitle() +
  ggtitle("Batter Random Effects") +
  ylab("Random Effect")

grid.arrange(q1, q2)

# best pitchers and hitters

# first add names

inner_join(pitcher_estimates,
           select(Master, retroID,
                  nameFirst, nameLast),
           by = c("PIT_ID" = "retroID")) %>%
  mutate(Name = paste(nameFirst, nameLast)) %>%
  select(Name, PA, Estimate, mean) ->
  pitcher_estimates

inner_join(batter_estimates,
           select(Master, retroID,
                  nameFirst, nameLast),
           by = c("BAT_ID" = "retroID")) %>%
  mutate(Name = paste(nameFirst, nameLast)) %>%
  select(Name, PA, Estimate, mean) ->
  batter_estimates

# create top 10 lists

pitcher_estimates %>%
  arrange(mean) %>%
  head(10)

batter_estimates %>%
  arrange(desc(mean)) %>%
  head(10)
	# load in packages

	library(readr)
	library(dplyr)
	library(brms)
	library(ggplot2)
	library(posterior)
	library(gridExtra)
	library(Lahman)

	# load 2021 data and Fangraphs data

	load("~/Dropbox/Google Drive/Retrosheet/pbp.2021.Rdata")
	fg_guts <-
	read_csv("https://raw.githubusercontent.com/bayesball/HomeRuns2021/main/fg_guts.csv")

	# only consider batting events and collect
	# key variables

	d2021 %>%
	filter(BAT_EVENT_FL == TRUE) %>%
	select(PIT_ID, BAT_ID, EVENT_CD) %>%
	mutate(Season = 2021) -> d2021a

	# compute wOBA weights

	d2021a %>%
	inner_join(fg_guts, by = "Season") %>%
	mutate(WT = wBB * (EVENT_CD %in% 14:15) +
	wHBP * (EVENT_CD == 16) +
	w1B * (EVENT_CD == 20) +
	w2B * (EVENT_CD == 21) +
	w3B * (EVENT_CD == 22) +
	wHR * (EVENT_CD == 23)) %>%
	select(PIT_ID, BAT_ID, EVENT_CD, WT) -> d2021a

	# brms fit

	fitB <- brm(WT ~ 1 + (1 \| PIT_ID) + (1 \| BAT_ID),
	data = d2021a)

	# collect draws and summarize

	par_sim <- as_draws(fitB)
	S <- summarize_draws(par_sim)

	# plot density estimates of standard deviations

	sim_PIT <- unlist(subset_draws(par_sim,
	variable = "sd_PIT_ID__Intercept"))
	sim_BAT <- unlist(subset_draws(par_sim,
	variable = "sd_BAT_ID__Intercept"))

	df <- rbind(data.frame(SD = sim_PIT, Type = "Pitcher"),
	data.frame(SD = sim_BAT, Type = "Batter"))

	ggplot(df, aes(SD, color = Type)) +
	geom_density() +
	centertitle() +
	increasefont() +
	ggtitle("Posterior Densities of
	RE Standard Deviations")

	# collect mean, median and sd of
	# b_intercept, sd_BAT_ID, sd_PIT_ID, sigma

	S1 <- S[1:4, 1:4] # means of main variables

	# work with random effects

	# find indices of re's for batters and for pitchers
	b_ind <- grep("r\\_BAT", S$variable)
	p_ind <- grep("r\\_PIT", S$variable)

	# create data frames of random effect estimates
	S_batter <- S[b_ind, c("variable", "mean")]
	S_batter %>%
	mutate(Var = gsub("r_BAT_ID\\[", "", variable),
	Var = gsub(",Intercept\\]", "", Var)) %>%
	select(Var, mean) -> S_batter

	S_pitcher <- S[p_ind, c("variable", "mean")]
	S_pitcher %>%
	mutate(Var = gsub("r_PIT_ID\\[", "", variable),
	Var = gsub(",Intercept\\]", "", Var)) %>%
	select(Var, mean) -> S_pitcher

	# collect PA and raw wOBA estimates
	d2021a %>%
	group_by(PIT_ID) %>%
	summarize(PA = n(),
	Estimate = mean(WT)) ->
	pitcher_estimates

	d2021a %>%
	group_by(BAT_ID) %>%
	summarize(PA = n(),
	Estimate = mean(WT)) ->
	batter_estimates

	# merge raw and RE estimate tables

	inner_join(batter_estimates, S_batter,
	by = c("BAT_ID" = "Var")) ->
	batter_estimates

	inner_join(pitcher_estimates, S_pitcher,
	by = c("PIT_ID" = "Var")) ->
	pitcher_estimates

	# some graphs -- first a couple of utility functions

	centertitle <- function(){
	theme(plot.title = element_text(colour = "blue",
	size = 18,
	hjust = 0.5,
	vjust = 0.8,
	angle = 0))
	}
	increasefont <-function(){
	theme(text=element_text(size=18))
	}

	# scatterplot of raw wOBA estimates and random effects
	ggplot(pitcher_estimates,
	aes(Estimate, mean, color = PA)) +
	geom_point() +
	increasefont()

	# plot of raw wOBA and random effects against PA
	p1 <- ggplot(pitcher_estimates, aes(PA, Estimate)) +
	geom_point() +
	increasefont() +
	centertitle() +
	ggtitle("Raw Pitcher Estimates")

	p2 <- ggplot(pitcher_estimates, aes(PA, mean)) +
	geom_point() +
	geom_smooth(color = "blue") +
	geom_hline(yintercept = 0, color = "red") +
	increasefont() +
	centertitle() +
	ggtitle("Pitcher Random Effects") +
	ylab("Random Effect")

	grid.arrange(p1, p2)

	# do this for batters

	q1 <- ggplot(batter_estimates, aes(PA, Estimate)) +
	geom_point() +
	increasefont() +
	centertitle() +
	ggtitle("Raw Batter Estimates")

	q2 <- ggplot(batter_estimates, aes(PA, mean)) +
	geom_point() +
	geom_smooth(color = "blue") +
	geom_hline(yintercept = 0, color = "red") +
	increasefont() +
	centertitle() +
	ggtitle("Batter Random Effects") +
	ylab("Random Effect")

	grid.arrange(q1, q2)

	# best pitchers and hitters

	# first add names

	inner_join(pitcher_estimates,
	select(Master, retroID,
	nameFirst, nameLast),
	by = c("PIT_ID" = "retroID")) %>%
	mutate(Name = paste(nameFirst, nameLast)) %>%
	select(Name, PA, Estimate, mean) ->
	pitcher_estimates

	inner_join(batter_estimates,
	select(Master, retroID,
	nameFirst, nameLast),
	by = c("BAT_ID" = "retroID")) %>%
	mutate(Name = paste(nameFirst, nameLast)) %>%
	select(Name, PA, Estimate, mean) ->
	batter_estimates

	# create top 10 lists

	pitcher_estimates %>%
	arrange(mean) %>%
	head(10)

	batter_estimates %>%
	arrange(desc(mean)) %>%
	head(10)