Created
August 17, 2019 18:31
-
-
Save bayesball/8102d46218a4978f271f245cba728cb9 to your computer and use it in GitHub Desktop.
Illustrates the use of a GAM fit to predict home runs and check the predictions on a new dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load several packages | |
library(tidyverse) | |
library(CalledStrike) | |
library(mgcv) | |
# read in Statcast data for five seasons | |
sc <- read_csv("five_seasons_data.csv") | |
# create month variables | |
sc %>% mutate(month = substr(game_date, 6, 7)) -> sc | |
sc %>% | |
mutate(month = ifelse(month == "03", "04", month), | |
month = ifelse(month == "10", "09", month), | |
season = as.character(Season)) %>% | |
mutate(Month = as.numeric(month)) -> sc | |
## write a general function to do fitting and testing stages | |
fit_predict <- function(data1, data2, the_title = ""){ | |
fit <- gam(HR ~ s(launch_angle, launch_speed), | |
family = binomial, | |
data = data1) | |
sc_new <- data2 | |
actual <- sum(sc_new$events == "home_run", na.rm = TRUE) | |
prob <- predict(fit, sc_new, type = "response") | |
one_prediction <- function(Prob){ | |
sum(runif(length(prob)) < Prob) | |
} | |
many_predictions <- replicate(1000, one_prediction(prob)) | |
ggplot(data = data.frame(Prediction = | |
many_predictions), | |
aes(Prediction)) + | |
geom_histogram(color = "white", | |
fill = "tan", | |
bins = 15) + | |
increasefont() + | |
geom_vline(xintercept = actual, size = 2, | |
color = "red") + | |
ggtitle(the_title) + | |
centertitle() | |
} | |
# break 2018 data into two random parts | |
sc2018 <- filter(sc, season == 2018, | |
launch_angle > 10, | |
launch_speed > 80) | |
N <- nrow(sc2018) | |
ind <- sample(N, size = N / 2) | |
data1 <- sc2018[ind, ] | |
data2 <- sc2018[-ind, ] | |
fit_predict(data1, data2, | |
"Fit on Half of 2018, Predict on Other Half") | |
# modify second data part | |
# do random split but change LS and LA in test set | |
# 0.3 increase in LS, 0.4 increase in LA in 2019 | |
sc2018 <- filter(sc, season == 2018, | |
launch_angle > 10, | |
launch_speed > 80) | |
N <- nrow(sc2018) | |
ind <- sample(N, size = N / 2) | |
data1 <- sc2018[ind, ] | |
data2 <- sc2018[-ind, ] | |
data2$launch_angle <- data2$launch_angle + 0.4 | |
data2$launch_speed <- data2$launch_speed + 0.3 | |
fit_predict(data1, data2, | |
"Fit on Half of 2018, Predict on Adjusted Other Half") | |
# summaries for launch angles and launch speeds for 2018 and 2019 seasons | |
sc2018 <- filter(sc, season == 2018, | |
launch_angle > 10, | |
launch_speed > 80) | |
sc2019 <- filter(sc, season == 2019, | |
launch_angle > 10, | |
launch_speed > 80) | |
sc2018 %>% | |
summarize(N = n(), | |
LS = mean(launch_speed, na.rm = TRUE), | |
LS_sd = sd(launch_speed, na.rm = TRUE), | |
LA = mean(launch_angle, na.rm = TRUE), | |
LA_sd = sd(launch_angle, na.rm = TRUE)) -> | |
S18 | |
sc2019 %>% | |
summarize(N = n(), LS = mean(launch_speed, na.rm = TRUE), | |
LS_sd = sd(launch_speed, na.rm = TRUE), | |
LA = mean(launch_angle, na.rm = TRUE), | |
LA_sd = sd(launch_angle, na.rm = TRUE)) -> | |
S19 | |
S18 | |
S19 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment