Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created October 25, 2018 01:51
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/aaf13fda751bf68d2ad2ee41ab044e71 to your computer and use it in GitHub Desktop.
Save bayesball/aaf13fda751bf68d2ad2ee41ab044e71 to your computer and use it in GitHub Desktop.
Exploring 2018 swing and miss rates
# Load in the necessary packages
library(tidyverse)
library(mgcv)
library(ggrepel)
library(gridExtra)
# Read in the statcast data for the 2018 season
d <- read_csv("../Statcastdata/statcast2018new.csv")
# Create Swing, Miss, Count, Count_Type variables
d %>%
mutate(Swing = ifelse(description %in%
c("foul", "foul_bunt", "foul_pitchout",
"foul_top", "hit_into_play",
"hit_into_play_no_out",
"hit_into_play_score", "missed_bunt",
"swinging_pitchout",
"swinging_strike",
"swinging_strike_blocked"), 1, 0),
Miss = ifelse(description %in%
c("swinging_pitchout",
"swinging_strike",
"swinging_strike_blocked"), 1, 0),
Count = paste(balls, strikes, sep="-"),
Count_Type = ifelse(Count %in% c("0-2", "1-2", "2-2"),
"pitcher",
ifelse(Count %in% c("2-0", "3-1", "3-0"),
"hitter", "neutral"))) -> d
# Find overall swing rate and miss rate
d %>% summarize(S = mean(Swing),
M = sum(Miss) / sum(Swing))
## Batter swing and miss rates
TH <- theme(plot.title = element_text(
colour = "blue", size = 18, hjust = 0.5))
d %>% group_by(player_name) %>%
summarize(N = n(), SWING = sum(Swing),
MISS = sum(Miss)) -> S7
library(ggrepel)
ggplot(filter(S7, N >=1000),
aes(100 * SWING / N, 100 * MISS / SWING)) +
geom_point() +
geom_smooth() +
geom_label_repel(data = filter(S7, N >= 1000,
player_name == "Mookie Betts" |
player_name == "Joey Gallo" |
MISS / SWING > .4 |
MISS / SWING < .1),
aes(label=player_name)) +
xlab("Swing Rate") +
ylab("Miss Rate") + TH +
ggtitle("Swing and Miss Rates of 2018 Regulars")
## 2018 World Series players swing and miss rates
Dodgers <- data.frame(Player = c("Austin Barnes", "Yasmani Grandal",
"Cody Bellinger", "Brian Dozier", "David Freese",
"Enrique Hernandez", "Manny Machado", "Max Muncy",
"Chris Taylor", "Justin Turner", "Matt Kemp",
"Joc Pederson", "Yasiel Puig"),
Team = rep("Dodgers", 13))
RedSox <- data.frame(Player = c("Sandy Leon", "Blake Swihart", "Christian Vazquez",
"Xander Bogaerts", "Rafael Devers", "Brock Holt", "Ian Kinsler",
"Mich Moreland", "Eduardo Nunez", "Steve Pearce",
"Andrew Benintendi", "Jackie Bradley Jr.", "Mookie Betts",
"J.D. Martinez"),
Team = rep("Red Sox", 14))
WS <- rbind(Dodgers, RedSox)
WSdata <- inner_join(S7, WS,
by = c("player_name" = "Player"))
ggplot(WSdata,
aes(100 * SWING / N, 100 * MISS / SWING)) +
geom_point() +
geom_label_repel(aes(label=player_name,
color = Team)) +
xlab("Swing Rate") +
ylab("Miss Rate") + TH +
ggtitle("Swing and Miss Rates of 2018 WS Players") +
scale_color_manual(values=c("blue", "red"))
########### pitch type effects
d %>% group_by(pitch_type) %>%
summarize(N = n(), SWING = sum(Swing),
MISS = sum(Miss)) -> S2
ggplot(filter(S2, N >= 10000),
aes(100 * SWING / N, 100 * MISS / SWING, label = pitch_type)) +
geom_label(color = "red", size = 5) +
xlab("Swing Rate") + ylab("Miss Rate") + TH +
ggtitle("Swing and Miss Rates for Different Pitch Types")
########### count effects
d %>% group_by(Count) %>%
summarize(N = n(), SWING = sum(Swing),
MISS = sum(Miss),
Count_Type = first(Count_Type)) -> S3
ggplot(S3,
aes(100 * SWING / N,
100 * MISS / SWING, label = Count,
color = Count_Type)) +
geom_label(size = 5) +
ylim(15, 28) +
xlim(0, 75) +
xlab("Swing Rate") + ylab("Miss Rate") + TH +
ggtitle("Swing and Miss Rates for Different Counts")
# want to use a heat map for swing rate, miss rate
# define the strike zone in a function
getzone <- function(){
topKzone <- 3.5
botKzone <- 1.6
inKzone <- -0.95
outKzone <- 0.95
data.frame(
x=c(inKzone, inKzone, outKzone, outKzone, inKzone),
y=c(botKzone, topKzone, topKzone, botKzone, botKzone))
}
# given the fit from a GAM, it finds the predicted
# probability over a grid of points
predict_zone <- function(fit){
data.predict <- expand.grid(
plate_x = seq(-1.5, 1.5, length.out=50),
plate_z = seq(0.5, 5, length.out=50))
data.predict %>% mutate(lp = predict(fit, newdata = .),
Probability = exp(lp) / (1 + exp(lp)))
}
# implement the GAM fit (logistic link) to the swing data
fit <- gam(Swing ~ s(plate_x, plate_z),
family=binomial, data=sample_n(d, size = 50000))
# plot
ggplot(getzone(), aes(x, y)) +
geom_tile(data=predict_zone(fit),
aes(x=plate_x, y=plate_z, fill= Probability)) +
scale_fill_distiller(palette = "Spectral") +
geom_path(lwd=1.5, col="black") +
coord_fixed() +
ggtitle("Heat Map of Probability of Swinging") + TH
########### model miss probability
fit <- gam(Miss ~ s(plate_x, plate_z),
family=binomial,
data=filter(d, Swing == 1))
ggplot(getzone(), aes(x, y)) +
geom_tile(data=predict_zone(fit),
aes(x=plate_x, y=plate_z, fill= Probability)) +
scale_fill_distiller(palette = "Spectral") +
geom_path(lwd=1.5, col="black") +
coord_fixed() + TH +
ggtitle("Heat Map of Probability of Missing")
### left/right breakdown on miss rate
TH <- theme(plot.title = element_text(
colour = "blue", size = 14, hjust = 0.5))
fit <- gam(Miss ~ s(plate_x, plate_z),
family=binomial,
data=filter(d, Swing == 1, stand == "L"))
p1 <- ggplot(getzone(), aes(x, y)) +
geom_tile(data=predict_zone(fit),
aes(x=plate_x, y=plate_z, fill= Probability)) +
scale_fill_distiller(palette = "Spectral") +
geom_path(lwd=1.5, col="black") +
coord_fixed() + TH +
ggtitle("Probability of Missing - Left")
fit <- gam(Miss ~ s(plate_x, plate_z),
family=binomial,
data=filter(d, Swing == 1, stand == "R"))
p2 <- ggplot(getzone(), aes(x, y)) +
geom_tile(data=predict_zone(fit),
aes(x=plate_x, y=plate_z, fill= Probability)) +
scale_fill_distiller(palette = "Spectral") +
geom_path(lwd=1.5, col="black") +
coord_fixed() + TH +
ggtitle("Probability of Missing - Right")
grid.arrange(p1, p2, nrow = 1)
## show heat maps of miss rates for two players
fit <- gam(Miss ~ s(plate_x, plate_z),
family=binomial,
data=filter(d, Swing == 1,
player_name == "Mookie Betts"))
p3 <- ggplot(getzone(), aes(x, y)) +
geom_tile(data=predict_zone(fit),
aes(x=plate_x, y=plate_z, fill= Probability)) +
scale_fill_distiller(palette = "Spectral") +
geom_path(lwd=1.5, col="black") +
coord_fixed() + TH +
ggtitle("P(Missing) - Mookie Betts")
fit <- gam(Miss ~ s(plate_x, plate_z),
family=binomial,
data=filter(d, Swing == 1,
player_name == "Max Muncy"))
p4 <- ggplot(getzone(), aes(x, y)) +
geom_tile(data=predict_zone(fit),
aes(x=plate_x, y=plate_z, fill= Probability)) +
scale_fill_distiller(palette = "Spectral") +
geom_path(lwd=1.5, col="black") +
coord_fixed() + TH +
ggtitle("P(Missing) - Max Muncy")
grid.arrange(p3, p4, nrow = 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment