Created
April 19, 2020 17:03
-
-
Save bayesball/7de9f8992160bb9cda8fa98a49627c52 to your computer and use it in GitHub Desktop.
R Code for Computing Player HR Improvement Due to Changes in Launch Conditions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# data frame scip reads all Statcast play by play for the 2015 through 2019 seasons | |
scip <- read_csv("~/Dropbox/2016 WORK/BLOG Baseball R/OTHER/StatcastData/5_seasons_inplay_b.csv") | |
# function all_work() does all the computation | |
# inputs are season1 - first season, season2 - second season, minBIP - min balls in play for both seasons | |
# two outputs: p is the ggplot2 representation of the graph of the changes in HR production | |
# d contains the data frame with the calculations for all players | |
all_work <- function(season1, season2, | |
minBIP = 300){ | |
require(tidyverse) | |
require(mgcv) | |
require(CalledStrike) | |
# function to implement the GAM random | |
# effects model | |
two_var_fit_re <- function(df){ | |
gam(HR ~ s(launch_angle, launch_speed) + | |
s(Batter, bs = "re"), | |
family = binomial, | |
data = df) -> fit | |
} | |
# find players with at least 300 BIP | |
# for both seasons | |
scip %>% | |
filter(Season == season1) %>% | |
group_by(batter) %>% | |
summarize(BIP = n()) %>% | |
filter(BIP >= minBIP) %>% | |
select(batter) %>% pull() -> players300a | |
scip %>% | |
filter(Season == season2) %>% | |
group_by(batter) %>% | |
summarize(BIP = n()) %>% | |
filter(BIP >= minBIP) %>% | |
select(batter) %>% pull() -> players300b | |
players300 <- intersect(players300a, | |
players300b) | |
N <- length(players300) | |
# create data frame of data for those players | |
# for both seasons | |
df300_twoyears <- filter(scip, | |
batter %in% players300, | |
Season %in% c(season1, season2)) %>% | |
mutate(Batter = factor(batter)) | |
df300_season1 <- filter(df300_twoyears, | |
Season == season1) | |
# run GAM fit to first season data | |
fit1 <- two_var_fit_re(df300_season1) | |
# extract the random effects | |
beta <- coef(fit1) | |
p <- length(beta) | |
REFF <- data.frame( | |
Batter = levels(df300_season1$Batter), | |
reff = coef(fit1)[(p - N + 1):p]) | |
row.names(REFF) <- NULL | |
# merge these season1 random effects | |
# with the season 2 dataset | |
df300_season2 <- filter(df300_twoyears, | |
Season == season2) %>% | |
select(Batter, HR, launch_angle, | |
launch_speed) %>% | |
inner_join(REFF, by = c("Batter")) -> | |
df300_season2 | |
# use season1 fit to predict HR probability | |
# for each BIP in season2 | |
df300_season2$Prob <- predict(fit1, | |
newdata = df300_season2, | |
type = "response") | |
# for each player, compute expected HR | |
# for single BIP in second season | |
df300_season2 %>% | |
group_by(Batter) %>% | |
summarize(RE = first(reff), | |
BIP2 = n(), | |
HR2 = sum(HR), | |
E1 = sum(Prob, na.rm = TRUE) / | |
BIP2) -> S2 | |
# for each player, find BIP and HR in | |
# first season | |
df300_season1 %>% | |
group_by(Batter) %>% | |
summarize(BIP1 = n(), | |
HR1 = sum(HR)) -> S1 | |
# merge the two datasets | |
S12 <- inner_join(S1, S2, | |
by = "Batter") | |
# graph the first season HR against the | |
# increase in HR due to the change in | |
# launch conditions in 2nd season | |
the_graph <- ggplot(S12, aes(HR1, BIP1 * E1 - HR1)) + | |
geom_point() + | |
geom_hline(yintercept = 0, color = "red") + | |
increasefont() + | |
ggtitle(paste(season1, "/", season2)) + | |
centertitle() + | |
xlab(paste(season1, "HR")) + | |
ylab(paste(season2, "Increase Due to LC")) | |
list(p = the_graph, d = S12) | |
} | |
out1 <- all_work(2015, 2016, 200) | |
out2 <- all_work(2016, 2017, 200) | |
out3 <- all_work(2017, 2018, 200) | |
out4 <- all_work(2018, 2019, 200) | |
save(out1, out2, out3, out4, | |
file = "twoyears2.Rdata") | |
#load("twoyears.Rdata") | |
#load("twoyears2.Rdata") | |
# check Chris Davis 448801 | |
out1$d %>% filter(as.character(Batter) == | |
"448801") | |
# proportion of positive increases for each | |
# pair of years | |
prop_pos <- function(out){ | |
out$d %>% | |
summarize(N = n(), | |
P = mean(BIP1 * E1 - HR1 > 0)) | |
} | |
prop_pos(out1) # 210, 0.643 | |
prop_pos(out2) # 210, 0.443 | |
prop_pos(out3) # 216, 0.569 | |
prop_pos(out4) # 193, 0.611 | |
dis_df <- data.frame( | |
Transition = c("2015/2016", "2016/2017", | |
"2017/2018", "2018/2019"), | |
Percentage_Positive = c(64.3, 44.3, | |
56.9, 61.1)) | |
######### | |
out1$p + increasefont(12) + | |
ylab("2016 Increase") -> p1 | |
out2$p + increasefont(12) + | |
ylab("2017 Increase") -> p2 | |
out3$p + increasefont(12) + | |
ylab("2018 Increase") -> p3 | |
out4$p + increasefont(12) + | |
ylab("2019 Increase") -> p4 | |
grid.arrange(p1, p2, p3, p4) | |
# put all data frames together | |
out1$d %>% | |
mutate(Transition = "2015/2016") -> df1 | |
out2$d %>% | |
mutate(Transition = "2016/2017") -> df2 | |
out3$d %>% | |
mutate(Transition = "2017/2018") -> df3 | |
out4$d %>% | |
mutate(Transition = "2018/2019") -> df4 | |
df_all <- rbind(df1, df2, df3, df4) | |
df_all %>% | |
mutate(Expected = BIP1 * E1, | |
Change = Expected - HR1) -> df_all | |
select(df_all, Batter, Transition, HR1, | |
Expected, Change) %>% | |
arrange(desc(Change)) %>% | |
head(5) -> top_df | |
select(df_all, Batter, Transition, HR1, | |
Expected, Change) %>% | |
arrange(Change) %>% | |
head(5) -> bottom_df | |
# players at top | |
top_df$Player <- | |
c("Mookie Betts", "Xander Bogaerts", | |
"Christian Yelich", | |
"J.D. Martinez", "Yonder Alonso") | |
bottom_df$Player <- | |
c("Mark Trumbo", "Khris Davis", | |
"Evan Longoria", "Jonathan Lucroy", | |
"Miguel Cabera") | |
select(top_df, Player, Transition, | |
HR1, Expected, Change) | |
select(bottom_df, Player, Transition, | |
HR1, Expected, Change) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment