Created
October 25, 2018 01:51
-
-
Save bayesball/aaf13fda751bf68d2ad2ee41ab044e71 to your computer and use it in GitHub Desktop.
Exploring 2018 swing and miss rates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load in the necessary packages | |
library(tidyverse) | |
library(mgcv) | |
library(ggrepel) | |
library(gridExtra) | |
# Read in the statcast data for the 2018 season | |
d <- read_csv("../Statcastdata/statcast2018new.csv") | |
# Create Swing, Miss, Count, Count_Type variables | |
d %>% | |
mutate(Swing = ifelse(description %in% | |
c("foul", "foul_bunt", "foul_pitchout", | |
"foul_top", "hit_into_play", | |
"hit_into_play_no_out", | |
"hit_into_play_score", "missed_bunt", | |
"swinging_pitchout", | |
"swinging_strike", | |
"swinging_strike_blocked"), 1, 0), | |
Miss = ifelse(description %in% | |
c("swinging_pitchout", | |
"swinging_strike", | |
"swinging_strike_blocked"), 1, 0), | |
Count = paste(balls, strikes, sep="-"), | |
Count_Type = ifelse(Count %in% c("0-2", "1-2", "2-2"), | |
"pitcher", | |
ifelse(Count %in% c("2-0", "3-1", "3-0"), | |
"hitter", "neutral"))) -> d | |
# Find overall swing rate and miss rate | |
d %>% summarize(S = mean(Swing), | |
M = sum(Miss) / sum(Swing)) | |
## Batter swing and miss rates | |
TH <- theme(plot.title = element_text( | |
colour = "blue", size = 18, hjust = 0.5)) | |
d %>% group_by(player_name) %>% | |
summarize(N = n(), SWING = sum(Swing), | |
MISS = sum(Miss)) -> S7 | |
library(ggrepel) | |
ggplot(filter(S7, N >=1000), | |
aes(100 * SWING / N, 100 * MISS / SWING)) + | |
geom_point() + | |
geom_smooth() + | |
geom_label_repel(data = filter(S7, N >= 1000, | |
player_name == "Mookie Betts" | | |
player_name == "Joey Gallo" | | |
MISS / SWING > .4 | | |
MISS / SWING < .1), | |
aes(label=player_name)) + | |
xlab("Swing Rate") + | |
ylab("Miss Rate") + TH + | |
ggtitle("Swing and Miss Rates of 2018 Regulars") | |
## 2018 World Series players swing and miss rates | |
Dodgers <- data.frame(Player = c("Austin Barnes", "Yasmani Grandal", | |
"Cody Bellinger", "Brian Dozier", "David Freese", | |
"Enrique Hernandez", "Manny Machado", "Max Muncy", | |
"Chris Taylor", "Justin Turner", "Matt Kemp", | |
"Joc Pederson", "Yasiel Puig"), | |
Team = rep("Dodgers", 13)) | |
RedSox <- data.frame(Player = c("Sandy Leon", "Blake Swihart", "Christian Vazquez", | |
"Xander Bogaerts", "Rafael Devers", "Brock Holt", "Ian Kinsler", | |
"Mich Moreland", "Eduardo Nunez", "Steve Pearce", | |
"Andrew Benintendi", "Jackie Bradley Jr.", "Mookie Betts", | |
"J.D. Martinez"), | |
Team = rep("Red Sox", 14)) | |
WS <- rbind(Dodgers, RedSox) | |
WSdata <- inner_join(S7, WS, | |
by = c("player_name" = "Player")) | |
ggplot(WSdata, | |
aes(100 * SWING / N, 100 * MISS / SWING)) + | |
geom_point() + | |
geom_label_repel(aes(label=player_name, | |
color = Team)) + | |
xlab("Swing Rate") + | |
ylab("Miss Rate") + TH + | |
ggtitle("Swing and Miss Rates of 2018 WS Players") + | |
scale_color_manual(values=c("blue", "red")) | |
########### pitch type effects | |
d %>% group_by(pitch_type) %>% | |
summarize(N = n(), SWING = sum(Swing), | |
MISS = sum(Miss)) -> S2 | |
ggplot(filter(S2, N >= 10000), | |
aes(100 * SWING / N, 100 * MISS / SWING, label = pitch_type)) + | |
geom_label(color = "red", size = 5) + | |
xlab("Swing Rate") + ylab("Miss Rate") + TH + | |
ggtitle("Swing and Miss Rates for Different Pitch Types") | |
########### count effects | |
d %>% group_by(Count) %>% | |
summarize(N = n(), SWING = sum(Swing), | |
MISS = sum(Miss), | |
Count_Type = first(Count_Type)) -> S3 | |
ggplot(S3, | |
aes(100 * SWING / N, | |
100 * MISS / SWING, label = Count, | |
color = Count_Type)) + | |
geom_label(size = 5) + | |
ylim(15, 28) + | |
xlim(0, 75) + | |
xlab("Swing Rate") + ylab("Miss Rate") + TH + | |
ggtitle("Swing and Miss Rates for Different Counts") | |
# want to use a heat map for swing rate, miss rate | |
# define the strike zone in a function | |
getzone <- function(){ | |
topKzone <- 3.5 | |
botKzone <- 1.6 | |
inKzone <- -0.95 | |
outKzone <- 0.95 | |
data.frame( | |
x=c(inKzone, inKzone, outKzone, outKzone, inKzone), | |
y=c(botKzone, topKzone, topKzone, botKzone, botKzone)) | |
} | |
# given the fit from a GAM, it finds the predicted | |
# probability over a grid of points | |
predict_zone <- function(fit){ | |
data.predict <- expand.grid( | |
plate_x = seq(-1.5, 1.5, length.out=50), | |
plate_z = seq(0.5, 5, length.out=50)) | |
data.predict %>% mutate(lp = predict(fit, newdata = .), | |
Probability = exp(lp) / (1 + exp(lp))) | |
} | |
# implement the GAM fit (logistic link) to the swing data | |
fit <- gam(Swing ~ s(plate_x, plate_z), | |
family=binomial, data=sample_n(d, size = 50000)) | |
# plot | |
ggplot(getzone(), aes(x, y)) + | |
geom_tile(data=predict_zone(fit), | |
aes(x=plate_x, y=plate_z, fill= Probability)) + | |
scale_fill_distiller(palette = "Spectral") + | |
geom_path(lwd=1.5, col="black") + | |
coord_fixed() + | |
ggtitle("Heat Map of Probability of Swinging") + TH | |
########### model miss probability | |
fit <- gam(Miss ~ s(plate_x, plate_z), | |
family=binomial, | |
data=filter(d, Swing == 1)) | |
ggplot(getzone(), aes(x, y)) + | |
geom_tile(data=predict_zone(fit), | |
aes(x=plate_x, y=plate_z, fill= Probability)) + | |
scale_fill_distiller(palette = "Spectral") + | |
geom_path(lwd=1.5, col="black") + | |
coord_fixed() + TH + | |
ggtitle("Heat Map of Probability of Missing") | |
### left/right breakdown on miss rate | |
TH <- theme(plot.title = element_text( | |
colour = "blue", size = 14, hjust = 0.5)) | |
fit <- gam(Miss ~ s(plate_x, plate_z), | |
family=binomial, | |
data=filter(d, Swing == 1, stand == "L")) | |
p1 <- ggplot(getzone(), aes(x, y)) + | |
geom_tile(data=predict_zone(fit), | |
aes(x=plate_x, y=plate_z, fill= Probability)) + | |
scale_fill_distiller(palette = "Spectral") + | |
geom_path(lwd=1.5, col="black") + | |
coord_fixed() + TH + | |
ggtitle("Probability of Missing - Left") | |
fit <- gam(Miss ~ s(plate_x, plate_z), | |
family=binomial, | |
data=filter(d, Swing == 1, stand == "R")) | |
p2 <- ggplot(getzone(), aes(x, y)) + | |
geom_tile(data=predict_zone(fit), | |
aes(x=plate_x, y=plate_z, fill= Probability)) + | |
scale_fill_distiller(palette = "Spectral") + | |
geom_path(lwd=1.5, col="black") + | |
coord_fixed() + TH + | |
ggtitle("Probability of Missing - Right") | |
grid.arrange(p1, p2, nrow = 1) | |
## show heat maps of miss rates for two players | |
fit <- gam(Miss ~ s(plate_x, plate_z), | |
family=binomial, | |
data=filter(d, Swing == 1, | |
player_name == "Mookie Betts")) | |
p3 <- ggplot(getzone(), aes(x, y)) + | |
geom_tile(data=predict_zone(fit), | |
aes(x=plate_x, y=plate_z, fill= Probability)) + | |
scale_fill_distiller(palette = "Spectral") + | |
geom_path(lwd=1.5, col="black") + | |
coord_fixed() + TH + | |
ggtitle("P(Missing) - Mookie Betts") | |
fit <- gam(Miss ~ s(plate_x, plate_z), | |
family=binomial, | |
data=filter(d, Swing == 1, | |
player_name == "Max Muncy")) | |
p4 <- ggplot(getzone(), aes(x, y)) + | |
geom_tile(data=predict_zone(fit), | |
aes(x=plate_x, y=plate_z, fill= Probability)) + | |
scale_fill_distiller(palette = "Spectral") + | |
geom_path(lwd=1.5, col="black") + | |
coord_fixed() + TH + | |
ggtitle("P(Missing) - Max Muncy") | |
grid.arrange(p3, p4, nrow = 1) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment