Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active July 15, 2020 21:45
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/d6e72106759d50232334f2d27e1f34a2 to your computer and use it in GitHub Desktop.
Save bayesball/d6e72106759d50232334f2d27e1f34a2 to your computer and use it in GitHub Desktop.
Work on finding the "true zone" in called strikes and balls.
# I have the data files statcast2017.csv and UmpAssign08to17.csv
# in the current working directory
# load in packages that I need for this work
library(ggrepel)
library(tidyverse)
library(gridExtra)
# read in the data
sc <- read_csv("statcast2017.csv")
ump <- read_csv("UmpAssign08to17.csv")
# data wrangling
sc %>% mutate(Strike = ifelse(description ==
"called_strike", 1, 0),
Ball = ifelse(description %in%
c("ball", "blocked_ball"), 1, 0),
Count = paste(balls, strikes, sep="-")) ->
sc
sc %>% filter(Ball == 1 | Strike == 1) -> sc_called
inner_join(sc_called, ump) -> sc_called
# function to implement the gam fitting
do_fit <- function(d){
require(mgcv)
gam(Strike ~ s(plate_x, plate_z),
family=binomial,
data=d)
}
##### function cplot()
##### predict and plot over grid
##### assuming gam fit stored in variable fit
cplot <- function(){
df_p <- expand.grid(plate_x = seq(-1.2, 1.2, length=50),
plate_z = seq(1, 4, length=50))
df_p$lp <- predict(fit, df_p)
df_p$Probability <- exp(df_p$lp) / (1 + exp(df_p$lp))
topKzone <- 3.5
botKzone <- 1.6
inKzone <- -0.85
outKzone <- 0.85
kZone <- data.frame(
x=c(inKzone, inKzone, outKzone, outKzone, inKzone),
y=c(botKzone, topKzone, topKzone, botKzone, botKzone)
)
ggplot(df_p) +
stat_contour(aes(x=plate_x, y=plate_z,
z=Probability,
color = ..level..),
breaks=c(.5, .9),
size=1.5) +
geom_path(aes(x, y), data=kZone,
lwd=1, col="red") +
xlim(-1.5, 1.5) +
ylim(1.0, 4.0) +
theme(
plot.title = element_text(
colour = "blue",
size = 16,
hjust = 0.5,
vjust = 0.8,
angle = 0
)
) + coord_fixed(ratio = 1)
}
##################################
# basic plot
fit <- do_fit(filter(sc_called,
stand == "R"))
p1 <- cplot() + ggtitle("Batting Right")
fit <- do_fit(filter(sc_called,
stand == "L"))
p2 <- cplot() + ggtitle("Batting Left")
grid.arrange(p1, p2, ncol=2)
# look at number of strikes effect
fit <- do_fit(filter(sc_called,
stand == "R",
strikes == 0))
p1 <- cplot() + ggtitle("Right, 0 Strikes")
fit <- do_fit(filter(sc_called,
stand == "R",
strikes == 2))
p2 <- cplot() + ggtitle("Right, 2 Strikes")
fit <- do_fit(filter(sc_called,
stand == "L",
strikes == 0))
p3 <- cplot() + ggtitle("Left, 0 Strikes")
fit <- do_fit(filter(sc_called,
stand == "L",
strikes == 2))
p4 <- cplot() + ggtitle("Left, 2 Strikes")
library(gridExtra)
grid.arrange(p1, p2, p3, p4,
top="True Strike Zones - 2017 Data ")
# find the number of strikes and balls for all umps
sc_called %>% group_by(UmpName) %>%
summarize(S = sum(Strike),
B = sum(Ball),
N = S + B) -> SU
ggplot(SU, aes(N, S / N, label=UmpName)) +
geom_point() +
geom_label_repel(data =
filter(SU, S/N > .333, N > 3000),
color="red") +
xlab("Number of Called Pitches") +
ylab("Proportion of Called Strikes") +
ggtitle("2017 Umpire Stats") +
theme(
plot.title = element_text(
colour = "blue",
size = 16,
hjust = 0.5,
vjust = 0.8,
angle = 0
)
)
# focus on the top four in calling strikes
topUmp <- filter(SU, S / N > .333, N > 4000)
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[1],
stand == "R"))
p1a <- cplot() +
ggtitle(paste(topUmp$UmpName[1], "R"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[1],
stand == "L"))
p1b <- cplot() +
ggtitle(paste(topUmp$UmpName[1], "L"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[2],
stand == "R"))
p2a <- cplot() +
ggtitle(paste(topUmp$UmpName[2], "R"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[2],
stand == "L"))
p2b <- cplot() +
ggtitle(paste(topUmp$UmpName[2], "L"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[3],
stand == "R"))
p3a <- cplot() +
ggtitle(paste(topUmp$UmpName[3], "R"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[3],
stand == "L"))
p3b <- cplot() +
ggtitle(paste(topUmp$UmpName[3], "L"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[4],
stand == "R"))
p4a <- cplot() +
ggtitle(paste(topUmp$UmpName[4], "R"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[4],
stand == "L"))
p4b <- cplot() +
ggtitle(paste(topUmp$UmpName[4], "L"))
grid.arrange(p1a, p1b, p2a, p2b, ncol=2)
grid.arrange(p3a, p3b, p4a, p4b, ncol=2)
@aaronbaggett
Copy link

Hi Jim, great post. Would you mind sharing the code you used to create statcast2017.csv?

@bayesball
Copy link
Author

Aaron -- I just added a link to the script I used to get the Statcast data.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment