Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created October 22, 2017 21:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/0a1baaf8054253ea5097d206d7c8d0ec to your computer and use it in GitHub Desktop.
Save bayesball/0a1baaf8054253ea5097d206d7c8d0ec to your computer and use it in GitHub Desktop.
R code to explore swing and miss rates of 2017 World Series teams
# Read in pitch-by-pitch data for
# all Dodgers and Astros regulars in 2017 season
# data is downloaded into csv files that I read into R
library(tidyverse)
d1 <- read_csv("astros.csv")
d3 <- read_csv("dodgers.csv")
d1$Team <- "Astros"
d3$Team <- "Dodgers"
d13 <- rbind(d1, d3)
# Define swing and miss binary (0 and 1) variables
d13 %>% mutate(Swing=ifelse(description %in%
c("foul", "foul_tip", "hit_in_play",
"hit_into_play_no_out",
"hit_into_play_score",
"swinging_strike",
"swinging_strike_blocked"),
1, 0),
Miss=ifelse(description %in%
c("swinging_strike",
"swinging_strike_blocked"),
1, 0)) -> d13
# Compute swing and miss rates for all players
library(stringr)
d13 %>% group_by(player_name) %>%
summarize(N = n(), Team=first(Team),
S = sum(Swing), M = sum(Miss),
Swing_Rate = 100 * S / N,
Miss_Rate = 100 * M / S) -> SM
Names <- str_split(SM$player_name, " ", simplify=TRUE)
SM$Player <- Names[, 2]
# graph the swing and miss rates
TH <- theme(plot.title =
element_text(colour = "blue", size = 18,
hjust = 0.5, vjust = 0.8, angle = 0))
library(ggrepel)
ggplot(SM, aes(Swing_Rate, Miss_Rate,
color=Team, label=Player)) +
geom_label_repel() +
xlab("Swing Rate") + ylab("Miss Rate") +
ggtitle("Swing and Miss Rates") + TH
# define the ggplot code to draw zone
topKzone <- 3.5
botKzone <- 1.6
inKzone <- -0.85
outKzone <- 0.85
kZone <- data.frame(
x=c(inKzone, inKzone, outKzone, outKzone, inKzone),
y=c(botKzone, topKzone, topKzone, botKzone, botKzone)
)
ZONE <- geom_path(aes(x, y), data=kZone,
lwd=2, col="red")
# function to fit a gam model where swing is the binary response
# and plate_x and plate_z is the location
# use model to estimate the probability of a swing over a 50 by 50
# grid of pitch locations
swing_zone_data <- function(player, d){
require(mgcv)
d1 <- filter(d, player_name==player)
fit <- gam(Swing ~ s(plate_x, plate_z),
family=binomial,
data=d1)
G <- expand.grid(plate_x = seq(-2, 2, length.out=50),
plate_z = seq(0, 4, length.out=50))
lp <- predict(fit, G)
G$Prob <- exp(lp) / (1 + exp(lp))
G$Player <- player
G$Team <- d1$Team[1]
G
}
# Fit this model for all players -- put all of the predictions
# in a single data frame
playerlist <- unique(d13$player_name)
gdata <- NULL
for (j in 1:length(playerlist))
gdata <- rbind(gdata, swing_zone_data(playerlist[j], d13))
# Here are the contour graphs -- I am removing specific players
# where I could not get reasonable estimates at the swing
# probabilities
ggplot(filter(gdata, Team=="Dodgers",
Player != "Logan Forsythe",
Player != "Charlie Culberson")) +
scale_colour_manual(values = c("navyblue","gray70")) +
stat_contour(aes(x=plate_x, y=plate_z,
z=Prob),
breaks=c(0.5),
size=1.5) +
xlim(-1.3, 1.3) + ylim(1, 4) + ZONE +
facet_wrap(~ Player, ncol=3) +
ggtitle("50% Swing Contours - Dodgers") + TH
ggplot(filter(gdata, Team=="Astros",
Player != "Brian McCann",
Player != "Marwin Gonzalez")) +
scale_colour_manual(values = c("navyblue","gray70")) +
stat_contour(aes(x=plate_x, y=plate_z,
z=Prob),
breaks=c(0.5),
size=1.5) +
xlim(-1.3, 1.3) + ylim(1, 4) + ZONE +
facet_wrap(~ Player, ncol=3) +
ggtitle("50% Swing Contours - Astros") + TH
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment