Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created January 1, 2019 18:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/39d493fe1b4897d6065b6f68a8285811 to your computer and use it in GitHub Desktop.
Save bayesball/39d493fe1b4897d6065b6f68a8285811 to your computer and use it in GitHub Desktop.
Explores the impact of plate-appearance length (number of pitches) on run value, using 2018 Retrosheet data
# Load tidyverse packages and read in the Retrosheet data
library(tidyverse)
# Restore the objects saved in the 2018 play-by-play .Rdata file into the
# workspace.  The rest of the script works on a data frame named `d2018`,
# which is presumably one of the objects stored in this file -- confirm.
# The path is specific to the author's machine; point it at your own copy.
load("~/Dropbox/Google Drive/Retrosheet/pbp.2018.Rdata")
# If you have trouble getting the 2018 Retrosheet data, you can
# use one of several complete Retrosheet datasets from previous seasons;
# see http://www-math.bgsu.edu/~albert/retrosheet/
# Add three derived columns to the play-by-play data:
#   pseq        - PITCH_SEQ_TX with the characters . > 1 2 3 N + * stripped,
#                 intended to leave only the pitch-result codes
#   pseq_length - number of characters remaining in pseq, used below as the
#                 number of pitches in the plate appearance
#   Count       - "balls-strikes" label built from BALLS_CT and STRIKES_CT
d2018 <- mutate(
  d2018,
  pseq = str_remove_all(PITCH_SEQ_TX, "[.>123N+*]"),
  pseq_length = str_length(pseq),
  Count = paste(BALLS_CT, STRIKES_CT, sep = "-")
)
# Restrict the data to completed plate appearances: keep rows with at least
# one recorded pitch whose EVENT_CD is in the list of codes below.
# NOTE(review): in the Retrosheet/Chadwick event-code table, 5 appears to be
# "defensive indifference" and 13 "foul error", neither of which normally
# ends a PA -- confirm that these codes are intended here.
end_of_pa_code <- c(2, 3, 5, 13:24)
d2018a <- d2018 %>%
  filter(
    pseq_length > 0,
    EVENT_CD %in% end_of_pa_code
  )
# Find the longest plate appearance(s) of 2018 (Brandon Belt's long PA):
# keep the rows whose pitch count equals the maximum, then attach the
# batter's first and last name by matching BAT_ID to the Lahman retroID.
library(Lahman)
# Newer Lahman releases expose the player table as `People`, older ones as
# `Master`.  Use whichever this installation provides so the lookup works
# with either version of the package.
lahman_players <- if (exists("People", where = "package:Lahman",
                             inherits = FALSE)) {
  Lahman::People
} else {
  Lahman::Master
}
d2018a %>%
  filter(pseq_length == max(pseq_length)) %>%
  inner_join(
    select(lahman_players, nameFirst, nameLast, retroID),
    by = c("BAT_ID" = "retroID")
  ) %>%
  select(GAME_ID, nameFirst, nameLast, pseq, pseq_length, EVENT_CD)
# Summarize run value by PA length (number of pitches):
#   N - number of plate appearances of that length
#   M - mean of RUNS.VALUE
#   S - standard deviation of RUNS.VALUE
S_count <- d2018a %>%
  group_by(pseq_length) %>%
  summarize(
    N = n(),
    M = mean(RUNS.VALUE),
    S = sd(RUNS.VALUE)
  )
# Scatterplot of the mean run value (M) against the number of pitches,
# limited to PAs of at most 9 pitches.  The red horizontal line marks a
# mean run value of zero.
S_count %>%
  filter(pseq_length <= 9) %>%
  ggplot(aes(pseq_length, M)) +
  geom_point(size = 2) +
  geom_hline(yintercept = 0, color = "red") +
  scale_x_continuous(breaks = 1:9) +
  ggtitle("Value of Plate Appearance\nas Function of Number of Pitches") +
  xlab("Number of Pitches") +
  ylab("Mean Run Value") +
  theme(
    plot.title = element_text(colour = "blue", size = 18, hjust = 0.5)
  )
# Scatterplot of the standard deviation of run value (S) against the number
# of pitches, limited to PAs of at most 9 pitches.
ggplot(
  filter(S_count, pseq_length <= 9),
  aes(pseq_length, S)
) +
  geom_point() +
  # Pitch counts are whole numbers: label each one, as the other plots in
  # this script do, instead of relying on the default (fractional) breaks.
  scale_x_continuous(breaks = 1:9) +
  xlab("Number of Pitches") +
  ylab("SD Run Value") +
  ggtitle("Value of Plate Appearance as Function of Number of Pitches")
######### Summarize for each PA length and Count
# For every combination of PA length and ball-strike count, compute the
# number of PAs (N) and the mean (M) and standard deviation (S) of
# RUNS.VALUE, keeping only combinations with at least 10 PAs.
new_summ <- d2018a %>%
  group_by(pseq_length, Count) %>%
  summarize(
    N = n(),
    M = mean(RUNS.VALUE),
    S = sd(RUNS.VALUE)
  ) %>%
  # summarize() peels off only the last grouping variable, so the result
  # would otherwise stay grouped by pseq_length; drop that leftover grouping
  # so later verbs operate on the whole table.
  ungroup() %>%
  filter(N >= 10)
## Same plot of mean run value against number of pitches, but each point is
## drawn as a label showing the ball-strike count (PAs of at most 9 pitches).
## The red horizontal line marks a mean run value of zero.
new_summ %>%
  filter(pseq_length <= 9) %>%
  ggplot(aes(pseq_length, M, label = Count)) +
  geom_label(color = "blue", size = 5) +
  geom_hline(yintercept = 0, color = "red") +
  scale_x_continuous(breaks = 1:9) +
  ggtitle("Value of Plate Appearance\nas Function of # of Pitches and Count") +
  xlab("Number of Pitches") +
  ylab("Mean Run Value") +
  theme(
    plot.title = element_text(colour = "blue", size = 18, hjust = 0.5)
  )
# Home run summary for each combination of PA length and count:
#   N       - number of plate appearances
#   HR      - number of PAs with EVENT_CD equal to 23 (counted as home runs)
#   HR_Rate - proportion of PAs with EVENT_CD equal to 23
HR <- d2018a %>%
  group_by(pseq_length, Count) %>%
  summarize(
    N = n(),
    HR = sum(EVENT_CD == 23),
    HR_Rate = mean(EVENT_CD == 23)
  ) %>%
  # summarize() leaves the result grouped by pseq_length; remove the
  # leftover grouping so the table behaves as a plain summary downstream.
  ungroup()
# Plot the home run rate against the number of pitches, labelling each point
# with its ball-strike count.  Only PAs of at most 9 pitches and
# length/count combinations with more than 9 home runs are shown.
# Inside filter(), `HR` refers to the HR column of the HR data frame.
HR %>%
  filter(pseq_length <= 9, HR > 9) %>%
  ggplot(aes(pseq_length, HR_Rate, label = Count)) +
  geom_label(color = "blue", size = 5) +
  scale_x_continuous(breaks = 1:9) +
  ggtitle("Home Run Rate\nas Function of Number of Pitches") +
  xlab("Number of Pitches") +
  ylab("Home Run Rate") +
  theme(
    plot.title = element_text(colour = "blue", size = 18, hjust = 0.5)
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment