Skip to content

Instantly share code, notes, and snippets.

@bayesball bayesball/thecountwork.R
Last active Oct 30, 2018

Embed
What would you like to do?
Work on finding the "true zone" in called strikes and balls.
# I have the data files statcast2017.csv and UmpAssign08to17.csv
# in the current working directory
# load in packages that I need for this work
library(ggrepel)
library(tidyverse)
library(gridExtra)
# read in the data
sc <- read_csv("statcast2017.csv")
ump <- read_csv("UmpAssign08to17.csv")
# data wrangling
sc %>% mutate(Strike = ifelse(description ==
"called_strike", 1, 0),
Ball = ifelse(description %in%
c("ball", "blocked_ball"), 1, 0),
Count = paste(balls, strikes, sep="-")) ->
sc
sc %>% filter(Ball == 1 | Strike == 1) -> sc_called
inner_join(sc_called, ump) -> sc_called
# function to implement the gam fitting
do_fit <- function(d){
require(mgcv)
gam(Strike ~ s(plate_x, plate_z),
family=binomial,
data=d)
}
##### function cplot()
##### predict and plot over grid
##### assuming gam fit stored in variable fit
cplot <- function(){
df_p <- expand.grid(plate_x = seq(-1.2, 1.2, length=50),
plate_z = seq(1, 4, length=50))
df_p$lp <- predict(fit, df_p)
df_p$Probability <- exp(df_p$lp) / (1 + exp(df_p$lp))
topKzone <- 3.5
botKzone <- 1.6
inKzone <- -0.85
outKzone <- 0.85
kZone <- data.frame(
x=c(inKzone, inKzone, outKzone, outKzone, inKzone),
y=c(botKzone, topKzone, topKzone, botKzone, botKzone)
)
ggplot(df_p) +
stat_contour(aes(x=plate_x, y=plate_z,
z=Probability,
color = ..level..),
breaks=c(.5, .9),
size=1.5) +
geom_path(aes(x, y), data=kZone,
lwd=1, col="red") +
xlim(-1.5, 1.5) +
ylim(1.0, 4.0) +
theme(
plot.title = element_text(
colour = "blue",
size = 16,
hjust = 0.5,
vjust = 0.8,
angle = 0
)
) + coord_fixed(ratio = 1)
}
##################################
# basic plot
fit <- do_fit(filter(sc_called,
stand == "R"))
p1 <- cplot() + ggtitle("Batting Right")
fit <- do_fit(filter(sc_called,
stand == "L"))
p2 <- cplot() + ggtitle("Batting Left")
grid.arrange(p1, p2, ncol=2)
# look at number of strikes effect
fit <- do_fit(filter(sc_called,
stand == "R",
strikes == 0))
p1 <- cplot() + ggtitle("Right, 0 Strikes")
fit <- do_fit(filter(sc_called,
stand == "R",
strikes == 2))
p2 <- cplot() + ggtitle("Right, 2 Strikes")
fit <- do_fit(filter(sc_called,
stand == "L",
strikes == 0))
p3 <- cplot() + ggtitle("Left, 0 Strikes")
fit <- do_fit(filter(sc_called,
stand == "L",
strikes == 2))
p4 <- cplot() + ggtitle("Left, 2 Strikes")
library(gridExtra)
grid.arrange(p1, p2, p3, p4,
top="True Strike Zones - 2017 Data ")
# find the number of strikes and balls for all umps
sc_called %>% group_by(UmpName) %>%
summarize(S = sum(Strike),
B = sum(Ball),
N = S + B) -> SU
ggplot(SU, aes(N, S / N, label=UmpName)) +
geom_point() +
geom_label_repel(data =
filter(SU, S/N > .333, N > 3000),
color="red") +
xlab("Number of Called Pitches") +
ylab("Proportion of Called Strikes") +
ggtitle("2017 Umpire Stats") +
theme(
plot.title = element_text(
colour = "blue",
size = 16,
hjust = 0.5,
vjust = 0.8,
angle = 0
)
)
# focus on the top four in calling strikes
topUmp <- filter(SU, S / N > .333, N > 4000)
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[1],
stand == "R"))
p1a <- cplot() +
ggtitle(paste(topUmp$UmpName[1], "R"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[1],
stand == "L"))
p1b <- cplot() +
ggtitle(paste(topUmp$UmpName[1], "L"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[2],
stand == "R"))
p2a <- cplot() +
ggtitle(paste(topUmp$UmpName[2], "R"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[2],
stand == "L"))
p2b <- cplot() +
ggtitle(paste(topUmp$UmpName[2], "L"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[3],
stand == "R"))
p3a <- cplot() +
ggtitle(paste(topUmp$UmpName[3], "R"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[3],
stand == "L"))
p3b <- cplot() +
ggtitle(paste(topUmp$UmpName[3], "L"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[4],
stand == "R"))
p4a <- cplot() +
ggtitle(paste(topUmp$UmpName[4], "R"))
fit <-do_fit(filter(sc_called,
UmpName == topUmp$UmpName[4],
stand == "L"))
p4b <- cplot() +
ggtitle(paste(topUmp$UmpName[4], "L"))
grid.arrange(p1a, p1b, p2a, p2b, ncol=2)
grid.arrange(p3a, p3b, p4a, p4b, ncol=2)
@aaronbaggett

This comment has been minimized.

Copy link

commented Feb 7, 2018

Hi Jim, great post. Would you mind sharing the code you used to create statcast2017.csv?

@bayesball

This comment has been minimized.

Copy link
Owner Author

commented Feb 8, 2018

Aaron -- I just added a link to the script I used to get the Statcast data.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.