Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created April 19, 2020 17:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/7de9f8992160bb9cda8fa98a49627c52 to your computer and use it in GitHub Desktop.
Save bayesball/7de9f8992160bb9cda8fa98a49627c52 to your computer and use it in GitHub Desktop.
R Code for Computing Player HR Improvement Due to Changes in Launch Conditions.
# data frame scip reads all Statcast play by play for the 2015 through 2019 seasons
scip <- read_csv("~/Dropbox/2016 WORK/BLOG Baseball R/OTHER/StatcastData/5_seasons_inplay_b.csv")
# function all_work() does all the computation
# inputs are season1 - first season, season2 - second season, minBIP - min balls in play for both seasons
# two outputs: p is the ggplot2 representation of the graph of the changes in HR production
# d contains the data frame with the calculations for all players
all_work <- function(season1, season2,
minBIP = 300){
require(tidyverse)
require(mgcv)
require(CalledStrike)
# function to implement the GAM random
# effects model
two_var_fit_re <- function(df){
gam(HR ~ s(launch_angle, launch_speed) +
s(Batter, bs = "re"),
family = binomial,
data = df) -> fit
}
# find players with at least 300 BIP
# for both seasons
scip %>%
filter(Season == season1) %>%
group_by(batter) %>%
summarize(BIP = n()) %>%
filter(BIP >= minBIP) %>%
select(batter) %>% pull() -> players300a
scip %>%
filter(Season == season2) %>%
group_by(batter) %>%
summarize(BIP = n()) %>%
filter(BIP >= minBIP) %>%
select(batter) %>% pull() -> players300b
players300 <- intersect(players300a,
players300b)
N <- length(players300)
# create data frame of data for those players
# for both seasons
df300_twoyears <- filter(scip,
batter %in% players300,
Season %in% c(season1, season2)) %>%
mutate(Batter = factor(batter))
df300_season1 <- filter(df300_twoyears,
Season == season1)
# run GAM fit to first season data
fit1 <- two_var_fit_re(df300_season1)
# extract the random effects
beta <- coef(fit1)
p <- length(beta)
REFF <- data.frame(
Batter = levels(df300_season1$Batter),
reff = coef(fit1)[(p - N + 1):p])
row.names(REFF) <- NULL
# merge these season1 random effects
# with the season 2 dataset
df300_season2 <- filter(df300_twoyears,
Season == season2) %>%
select(Batter, HR, launch_angle,
launch_speed) %>%
inner_join(REFF, by = c("Batter")) ->
df300_season2
# use season1 fit to predict HR probability
# for each BIP in season2
df300_season2$Prob <- predict(fit1,
newdata = df300_season2,
type = "response")
# for each player, compute expected HR
# for single BIP in second season
df300_season2 %>%
group_by(Batter) %>%
summarize(RE = first(reff),
BIP2 = n(),
HR2 = sum(HR),
E1 = sum(Prob, na.rm = TRUE) /
BIP2) -> S2
# for each player, find BIP and HR in
# first season
df300_season1 %>%
group_by(Batter) %>%
summarize(BIP1 = n(),
HR1 = sum(HR)) -> S1
# merge the two datasets
S12 <- inner_join(S1, S2,
by = "Batter")
# graph the first season HR against the
# increase in HR due to the change in
# launch conditions in 2nd season
the_graph <- ggplot(S12, aes(HR1, BIP1 * E1 - HR1)) +
geom_point() +
geom_hline(yintercept = 0, color = "red") +
increasefont() +
ggtitle(paste(season1, "/", season2)) +
centertitle() +
xlab(paste(season1, "HR")) +
ylab(paste(season2, "Increase Due to LC"))
list(p = the_graph, d = S12)
}
out1 <- all_work(2015, 2016, 200)
out2 <- all_work(2016, 2017, 200)
out3 <- all_work(2017, 2018, 200)
out4 <- all_work(2018, 2019, 200)
save(out1, out2, out3, out4,
file = "twoyears2.Rdata")
#load("twoyears.Rdata")
#load("twoyears2.Rdata")
# check Chris Davis 448801
out1$d %>% filter(as.character(Batter) ==
"448801")
# proportion of positive increases for each
# pair of years
prop_pos <- function(out){
out$d %>%
summarize(N = n(),
P = mean(BIP1 * E1 - HR1 > 0))
}
prop_pos(out1) # 210, 0.643
prop_pos(out2) # 210, 0.443
prop_pos(out3) # 216, 0.569
prop_pos(out4) # 193, 0.611
dis_df <- data.frame(
Transition = c("2015/2016", "2016/2017",
"2017/2018", "2018/2019"),
Percentage_Positive = c(64.3, 44.3,
56.9, 61.1))
#########
out1$p + increasefont(12) +
ylab("2016 Increase") -> p1
out2$p + increasefont(12) +
ylab("2017 Increase") -> p2
out3$p + increasefont(12) +
ylab("2018 Increase") -> p3
out4$p + increasefont(12) +
ylab("2019 Increase") -> p4
grid.arrange(p1, p2, p3, p4)
# put all data frames together
out1$d %>%
mutate(Transition = "2015/2016") -> df1
out2$d %>%
mutate(Transition = "2016/2017") -> df2
out3$d %>%
mutate(Transition = "2017/2018") -> df3
out4$d %>%
mutate(Transition = "2018/2019") -> df4
df_all <- rbind(df1, df2, df3, df4)
df_all %>%
mutate(Expected = BIP1 * E1,
Change = Expected - HR1) -> df_all
select(df_all, Batter, Transition, HR1,
Expected, Change) %>%
arrange(desc(Change)) %>%
head(5) -> top_df
select(df_all, Batter, Transition, HR1,
Expected, Change) %>%
arrange(Change) %>%
head(5) -> bottom_df
# players at top
top_df$Player <-
c("Mookie Betts", "Xander Bogaerts",
"Christian Yelich",
"J.D. Martinez", "Yonder Alonso")
bottom_df$Player <-
c("Mark Trumbo", "Khris Davis",
"Evan Longoria", "Jonathan Lucroy",
"Miguel Cabera")
select(top_df, Player, Transition,
HR1, Expected, Change)
select(bottom_df, Player, Transition,
HR1, Expected, Change)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment