Created
October 22, 2017 21:47
-
-
Save bayesball/0a1baaf8054253ea5097d206d7c8d0ec to your computer and use it in GitHub Desktop.
R code to explore swing and miss rates of 2017 World Series teams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read in pitch-by-pitch data for | |
# all Dodgers and Astros regulars in 2017 season | |
# data is downloaded into csv files that I read into R | |
library(tidyverse) | |
d1 <- read_csv("astros.csv") | |
d3 <- read_csv("dodgers.csv") | |
d1$Team <- "Astros" | |
d3$Team <- "Dodgers" | |
d13 <- rbind(d1, d3) | |
# Define swing and miss binary (0 and 1) variables | |
d13 %>% mutate(Swing=ifelse(description %in% | |
c("foul", "foul_tip", "hit_in_play", | |
"hit_into_play_no_out", | |
"hit_into_play_score", | |
"swinging_strike", | |
"swinging_strike_blocked"), | |
1, 0), | |
Miss=ifelse(description %in% | |
c("swinging_strike", | |
"swinging_strike_blocked"), | |
1, 0)) -> d13 | |
# Compute swing and miss rates for all players | |
library(stringr) | |
d13 %>% group_by(player_name) %>% | |
summarize(N = n(), Team=first(Team), | |
S = sum(Swing), M = sum(Miss), | |
Swing_Rate = 100 * S / N, | |
Miss_Rate = 100 * M / S) -> SM | |
Names <- str_split(SM$player_name, " ", simplify=TRUE) | |
SM$Player <- Names[, 2] | |
# graph the swing and miss rates | |
TH <- theme(plot.title = | |
element_text(colour = "blue", size = 18, | |
hjust = 0.5, vjust = 0.8, angle = 0)) | |
library(ggrepel) | |
ggplot(SM, aes(Swing_Rate, Miss_Rate, | |
color=Team, label=Player)) + | |
geom_label_repel() + | |
xlab("Swing Rate") + ylab("Miss Rate") + | |
ggtitle("Swing and Miss Rates") + TH | |
# define the ggplot code to draw zone | |
topKzone <- 3.5 | |
botKzone <- 1.6 | |
inKzone <- -0.85 | |
outKzone <- 0.85 | |
kZone <- data.frame( | |
x=c(inKzone, inKzone, outKzone, outKzone, inKzone), | |
y=c(botKzone, topKzone, topKzone, botKzone, botKzone) | |
) | |
ZONE <- geom_path(aes(x, y), data=kZone, | |
lwd=2, col="red") | |
# function to fit a gam model where swing is the binary response | |
# and plate_x and plate_z is the location | |
# use model to estimate the probability of a swing over a 50 by 50 | |
# grid of pitch locations | |
swing_zone_data <- function(player, d){ | |
require(mgcv) | |
d1 <- filter(d, player_name==player) | |
fit <- gam(Swing ~ s(plate_x, plate_z), | |
family=binomial, | |
data=d1) | |
G <- expand.grid(plate_x = seq(-2, 2, length.out=50), | |
plate_z = seq(0, 4, length.out=50)) | |
lp <- predict(fit, G) | |
G$Prob <- exp(lp) / (1 + exp(lp)) | |
G$Player <- player | |
G$Team <- d1$Team[1] | |
G | |
} | |
# Fit this model for all players -- put all of the predictions | |
# in a single data frame | |
playerlist <- unique(d13$player_name) | |
gdata <- NULL | |
for (j in 1:length(playerlist)) | |
gdata <- rbind(gdata, swing_zone_data(playerlist[j], d13)) | |
# Here are the contour graphs -- I am removing specific players | |
# where I could not get reasonable estimates at the swing | |
# probabilities | |
ggplot(filter(gdata, Team=="Dodgers", | |
Player != "Logan Forsythe", | |
Player != "Charlie Culberson")) + | |
scale_colour_manual(values = c("navyblue","gray70")) + | |
stat_contour(aes(x=plate_x, y=plate_z, | |
z=Prob), | |
breaks=c(0.5), | |
size=1.5) + | |
xlim(-1.3, 1.3) + ylim(1, 4) + ZONE + | |
facet_wrap(~ Player, ncol=3) + | |
ggtitle("50% Swing Contours - Dodgers") + TH | |
ggplot(filter(gdata, Team=="Astros", | |
Player != "Brian McCann", | |
Player != "Marwin Gonzalez")) + | |
scale_colour_manual(values = c("navyblue","gray70")) + | |
stat_contour(aes(x=plate_x, y=plate_z, | |
z=Prob), | |
breaks=c(0.5), | |
size=1.5) + | |
xlim(-1.3, 1.3) + ylim(1, 4) + ZONE + | |
facet_wrap(~ Player, ncol=3) + | |
ggtitle("50% Swing Contours - Astros") + TH |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment