Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created August 17, 2019 18:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/8102d46218a4978f271f245cba728cb9 to your computer and use it in GitHub Desktop.
Save bayesball/8102d46218a4978f271f245cba728cb9 to your computer and use it in GitHub Desktop.
Illustrates the use of a GAM fit to predict home runs and check the predictions on a new dataset
# load several packages
library(tidyverse)
library(CalledStrike)
library(mgcv)
# read in Statcast data for five seasons
sc <- read_csv("five_seasons_data.csv")
# create month variables
sc %>% mutate(month = substr(game_date, 6, 7)) -> sc
sc %>%
mutate(month = ifelse(month == "03", "04", month),
month = ifelse(month == "10", "09", month),
season = as.character(Season)) %>%
mutate(Month = as.numeric(month)) -> sc
## write a general function to do fitting and testing stages
fit_predict <- function(data1, data2, the_title = ""){
fit <- gam(HR ~ s(launch_angle, launch_speed),
family = binomial,
data = data1)
sc_new <- data2
actual <- sum(sc_new$events == "home_run", na.rm = TRUE)
prob <- predict(fit, sc_new, type = "response")
one_prediction <- function(Prob){
sum(runif(length(prob)) < Prob)
}
many_predictions <- replicate(1000, one_prediction(prob))
ggplot(data = data.frame(Prediction =
many_predictions),
aes(Prediction)) +
geom_histogram(color = "white",
fill = "tan",
bins = 15) +
increasefont() +
geom_vline(xintercept = actual, size = 2,
color = "red") +
ggtitle(the_title) +
centertitle()
}
# break 2018 data into two random parts
sc2018 <- filter(sc, season == 2018,
launch_angle > 10,
launch_speed > 80)
N <- nrow(sc2018)
ind <- sample(N, size = N / 2)
data1 <- sc2018[ind, ]
data2 <- sc2018[-ind, ]
fit_predict(data1, data2,
"Fit on Half of 2018, Predict on Other Half")
# modify second data part
# do random split but change LS and LA in test set
# 0.3 increase in LS, 0.4 increase in LA in 2019
sc2018 <- filter(sc, season == 2018,
launch_angle > 10,
launch_speed > 80)
N <- nrow(sc2018)
ind <- sample(N, size = N / 2)
data1 <- sc2018[ind, ]
data2 <- sc2018[-ind, ]
data2$launch_angle <- data2$launch_angle + 0.4
data2$launch_speed <- data2$launch_speed + 0.3
fit_predict(data1, data2,
"Fit on Half of 2018, Predict on Adjusted Other Half")
# summaries for launch angles and launch speeds for 2018 and 2019 seasons
sc2018 <- filter(sc, season == 2018,
launch_angle > 10,
launch_speed > 80)
sc2019 <- filter(sc, season == 2019,
launch_angle > 10,
launch_speed > 80)
sc2018 %>%
summarize(N = n(),
LS = mean(launch_speed, na.rm = TRUE),
LS_sd = sd(launch_speed, na.rm = TRUE),
LA = mean(launch_angle, na.rm = TRUE),
LA_sd = sd(launch_angle, na.rm = TRUE)) ->
S18
sc2019 %>%
summarize(N = n(), LS = mean(launch_speed, na.rm = TRUE),
LS_sd = sd(launch_speed, na.rm = TRUE),
LA = mean(launch_angle, na.rm = TRUE),
LA_sd = sd(launch_angle, na.rm = TRUE)) ->
S19
S18
S19
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment