Last active
July 15, 2020 21:45
-
-
Save bayesball/d6e72106759d50232334f2d27e1f34a2 to your computer and use it in GitHub Desktop.
Work on finding the "true zone" in called strikes and balls.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I have the data files statcast2017.csv and UmpAssign08to17.csv | |
# in the current working directory | |
# load in packages that I need for this work | |
library(ggrepel) | |
library(tidyverse) | |
library(gridExtra) | |
# read in the data | |
sc <- read_csv("statcast2017.csv") | |
ump <- read_csv("UmpAssign08to17.csv") | |
# data wrangling | |
sc %>% mutate(Strike = ifelse(description == | |
"called_strike", 1, 0), | |
Ball = ifelse(description %in% | |
c("ball", "blocked_ball"), 1, 0), | |
Count = paste(balls, strikes, sep="-")) -> | |
sc | |
sc %>% filter(Ball == 1 | Strike == 1) -> sc_called | |
inner_join(sc_called, ump) -> sc_called | |
# function to implement the gam fitting | |
do_fit <- function(d){ | |
require(mgcv) | |
gam(Strike ~ s(plate_x, plate_z), | |
family=binomial, | |
data=d) | |
} | |
##### function cplot() | |
##### predict and plot over grid | |
##### assuming gam fit stored in variable fit | |
cplot <- function(){ | |
df_p <- expand.grid(plate_x = seq(-1.2, 1.2, length=50), | |
plate_z = seq(1, 4, length=50)) | |
df_p$lp <- predict(fit, df_p) | |
df_p$Probability <- exp(df_p$lp) / (1 + exp(df_p$lp)) | |
topKzone <- 3.5 | |
botKzone <- 1.6 | |
inKzone <- -0.85 | |
outKzone <- 0.85 | |
kZone <- data.frame( | |
x=c(inKzone, inKzone, outKzone, outKzone, inKzone), | |
y=c(botKzone, topKzone, topKzone, botKzone, botKzone) | |
) | |
ggplot(df_p) + | |
stat_contour(aes(x=plate_x, y=plate_z, | |
z=Probability, | |
color = ..level..), | |
breaks=c(.5, .9), | |
size=1.5) + | |
geom_path(aes(x, y), data=kZone, | |
lwd=1, col="red") + | |
xlim(-1.5, 1.5) + | |
ylim(1.0, 4.0) + | |
theme( | |
plot.title = element_text( | |
colour = "blue", | |
size = 16, | |
hjust = 0.5, | |
vjust = 0.8, | |
angle = 0 | |
) | |
) + coord_fixed(ratio = 1) | |
} | |
################################## | |
# basic plot | |
fit <- do_fit(filter(sc_called, | |
stand == "R")) | |
p1 <- cplot() + ggtitle("Batting Right") | |
fit <- do_fit(filter(sc_called, | |
stand == "L")) | |
p2 <- cplot() + ggtitle("Batting Left") | |
grid.arrange(p1, p2, ncol=2) | |
# look at number of strikes effect | |
fit <- do_fit(filter(sc_called, | |
stand == "R", | |
strikes == 0)) | |
p1 <- cplot() + ggtitle("Right, 0 Strikes") | |
fit <- do_fit(filter(sc_called, | |
stand == "R", | |
strikes == 2)) | |
p2 <- cplot() + ggtitle("Right, 2 Strikes") | |
fit <- do_fit(filter(sc_called, | |
stand == "L", | |
strikes == 0)) | |
p3 <- cplot() + ggtitle("Left, 0 Strikes") | |
fit <- do_fit(filter(sc_called, | |
stand == "L", | |
strikes == 2)) | |
p4 <- cplot() + ggtitle("Left, 2 Strikes") | |
library(gridExtra) | |
grid.arrange(p1, p2, p3, p4, | |
top="True Strike Zones - 2017 Data ") | |
# find the number of strikes and balls for all umps | |
sc_called %>% group_by(UmpName) %>% | |
summarize(S = sum(Strike), | |
B = sum(Ball), | |
N = S + B) -> SU | |
ggplot(SU, aes(N, S / N, label=UmpName)) + | |
geom_point() + | |
geom_label_repel(data = | |
filter(SU, S/N > .333, N > 3000), | |
color="red") + | |
xlab("Number of Called Pitches") + | |
ylab("Proportion of Called Strikes") + | |
ggtitle("2017 Umpire Stats") + | |
theme( | |
plot.title = element_text( | |
colour = "blue", | |
size = 16, | |
hjust = 0.5, | |
vjust = 0.8, | |
angle = 0 | |
) | |
) | |
# focus on the top four in calling strikes | |
topUmp <- filter(SU, S / N > .333, N > 4000) | |
fit <-do_fit(filter(sc_called, | |
UmpName == topUmp$UmpName[1], | |
stand == "R")) | |
p1a <- cplot() + | |
ggtitle(paste(topUmp$UmpName[1], "R")) | |
fit <-do_fit(filter(sc_called, | |
UmpName == topUmp$UmpName[1], | |
stand == "L")) | |
p1b <- cplot() + | |
ggtitle(paste(topUmp$UmpName[1], "L")) | |
fit <-do_fit(filter(sc_called, | |
UmpName == topUmp$UmpName[2], | |
stand == "R")) | |
p2a <- cplot() + | |
ggtitle(paste(topUmp$UmpName[2], "R")) | |
fit <-do_fit(filter(sc_called, | |
UmpName == topUmp$UmpName[2], | |
stand == "L")) | |
p2b <- cplot() + | |
ggtitle(paste(topUmp$UmpName[2], "L")) | |
fit <-do_fit(filter(sc_called, | |
UmpName == topUmp$UmpName[3], | |
stand == "R")) | |
p3a <- cplot() + | |
ggtitle(paste(topUmp$UmpName[3], "R")) | |
fit <-do_fit(filter(sc_called, | |
UmpName == topUmp$UmpName[3], | |
stand == "L")) | |
p3b <- cplot() + | |
ggtitle(paste(topUmp$UmpName[3], "L")) | |
fit <-do_fit(filter(sc_called, | |
UmpName == topUmp$UmpName[4], | |
stand == "R")) | |
p4a <- cplot() + | |
ggtitle(paste(topUmp$UmpName[4], "R")) | |
fit <-do_fit(filter(sc_called, | |
UmpName == topUmp$UmpName[4], | |
stand == "L")) | |
p4b <- cplot() + | |
ggtitle(paste(topUmp$UmpName[4], "L")) | |
grid.arrange(p1a, p1b, p2a, p2b, ncol=2) | |
grid.arrange(p3a, p3b, p4a, p4b, ncol=2) |
Aaron -- I just added a link to the script I used to get the Statcast data.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Jim, great post. Would you mind sharing the code you used to create
statcast2017.csv
?