Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
R code to fit generalized additive model to Statcast data
# load in packages
library(readr)
library(dplyr)
library(ggplot2)
library(mgcv)
##### read in a theme for the title of my plots
TH <- theme(plot.title = element_text(hjust = 0.5, size = 18))
##### read in data
##### I use the scrape_statcast_savant_batter_all function in
##### the baseballr data to collect the data for the 2017 season
sc_all <- read_csv("statcast2017.csv")
sc_ip <- filter(sc_all, type == "X")
sc_ip %>% mutate(outcome = ifelse(events %in%
c("single", "double", "triple", "home_run"), 1, 0)) -> sc_ip
##### basic exploratory plot
ggplot(slice(sc_ip, 1:2000),
aes(launch_angle, launch_speed, color=outcome)) +
geom_point() +
xlim(-25, 50) +
ylim(40, 120) +
ggtitle("2000 Balls Put In Play") + TH
##### some modeling
##### work with a sample of 60,000
sc_ip_sample <- slice(sc_ip, 1:60000)
sc_ip_test <- slice(sc_ip, 60001:129365)
##### fit gam model
fit <- gam(outcome ~ s(launch_angle, launch_speed),
family = binomial,
data = sc_ip_sample)
##### convert the predictions to the probability scale
invlogit <- function(x) exp(x) / (1 + exp(x))
##### what is the probability of a hit when the
##### launch angle is 20 degrees and the exit velocitiy is 100?
invlogit(predict(fit, data.frame(launch_angle = 20,
launch_speed = 100)))
##### predict over grid
df_p <- expand.grid(launch_angle = seq(-20, 50, length=50),
launch_speed = seq(40, 120, length=50))
df_p$lp <- predict(fit, df_p)
df_p$Probability <- exp(df_p$lp) / (1 + exp(df_p$lp))
# change colors and use finer grid
ggplot(df_p, aes(x=launch_angle, y=launch_speed,
z=Probability)) +
stat_contour(geom="polygon",
breaks=seq(0, 1, length.out = 21),
size=1.5,
aes(fill=..level..)) +
scale_fill_gradientn(colours = heat.colors(10)) +
geom_vline(xintercept = 0, color="black") +
xlim(-25, 50) +
ylim(40, 120) +
ggtitle("Contour Plot of Probability of Hit") + TH
############### evaluate several prediction rules
sc_ip_test$lp <- predict(fit, sc_ip_test)
sc_ip_test$predict <- ifelse(sc_ip_test$lp > 0, 1, 0)
sc_ip_test %>%
summarize(Rate = mean(outcome == predict, na.rm=TRUE))
# compare to other rules
sc_ip_test %>% summarize(mean(outcome, na.rm = TRUE))
sc_ip_test$rule1 <- ifelse(sc_ip_test$launch_speed > 100,
1, 0)
sc_ip_test %>%
summarize(Rate1 = mean(outcome == rule1, na.rm=TRUE))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.