Last active
September 26, 2022 00:43
-
-
Save bayesball/4a7d75314b75ac91121885567e9b5b9a to your computer and use it in GitHub Desktop.
R work for the Predicting Home Runs using a Multilevel Model post
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
get_hr_data <- function(pred_season, | |
retro_data, | |
n_prev_seasons = 4, | |
mPA = 1000, | |
mPA_season = 200){ | |
# n_prev_seasons is number of previous seasons | |
# mPA is the minimum number of cumulative PA | |
# retrodata - Retrosheet data for current season | |
# mPA_season - minimum number of PA in both | |
# halves of current season | |
require(lubridate) | |
require(Lahman) | |
require(dplyr) | |
get_players <- function(seasons, mPA){ | |
Batting %>% | |
filter(yearID %in% seasons) %>% | |
group_by(playerID, yearID) %>% | |
summarize(PA = sum(AB + BB + HBP), | |
HR = sum(HR), | |
.groups = "drop") -> S | |
S %>% | |
group_by(playerID) %>% | |
summarize(HR = sum(HR), | |
PA = sum(PA)) %>% | |
filter(PA >= mPA) %>% | |
inner_join(People, by = "playerID") %>% | |
select(retroID, HR, PA) | |
} | |
seasons <- (pred_season - n_prev_seasons): | |
(pred_season - 1) | |
get_players(seasons, mPA) -> players1000 | |
retro_data %>% | |
filter(BAT_EVENT_FL == TRUE) %>% | |
mutate(Year = substr(GAME_ID, 4, 7), | |
MonthDay = substr(GAME_ID, 8, 11), | |
Date = mdy(paste(MonthDay, Year, sep = ""))) -> | |
retro_data | |
midseason <- mdy(paste("07-01-", | |
retro_data$Year[1], | |
sep = "")) | |
da <- filter(retro_data, Date < midseason) | |
db <- filter(retro_data, Date >= midseason) | |
da %>% | |
group_by(BAT_ID) %>% | |
summarize(PA = n(), | |
HR = sum(EVENT_CD == 23)) %>% | |
mutate(Period = "First Half") -> Sa | |
db %>% | |
group_by(BAT_ID) %>% | |
summarize(PA = n(), | |
HR = sum(EVENT_CD == 23)) %>% | |
mutate(Period = "Second Half") -> Sb | |
Sab <- inner_join(Sa, Sb, by = "BAT_ID") %>% | |
filter(PA.x >= mPA_season, | |
PA.y >= mPA_season) | |
inner_join(Sab, players1000, | |
by = c("BAT_ID" = "retroID")) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
multilevel_reg <- function(d1){ | |
library(tidyverse) | |
library(runjags) | |
library(coda) | |
# multilevel modeling part | |
modelString = " | |
model { | |
for (i in 1:J){ | |
y[i] ~ dbin(p[i], n[i]) | |
logit(p[i]) <- a[player[i]] + b[player[i]] * x[i] | |
} | |
for (j in 1:J){ | |
a[j] <- B[j,1] | |
b[j] <- B[j,2] | |
B[j,1:2] ~ dmnorm (mu.beta[], Tau.B[,]) | |
} | |
mu.beta[1:2] ~ dmnorm(mean[1:2],prec[1:2 ,1:2]) | |
Tau.B[1:2 , 1:2] ~ dwish(Omega[1:2 ,1:2 ], 2) | |
} | |
" | |
# data for JAGS | |
logit <- function(y){ log(y) - log(1 - y)} | |
b.data <- list(y = d1$HR.x, | |
n = d1$PA.x, | |
player = d1$Player, | |
x = logit(d1$HR / d1$PA), | |
J = dim(d1)[1], | |
mean = c(0, 0), | |
Omega = diag(c(.1, .1)), | |
prec = diag(c(1.0E-6, 1.0E-6))) | |
# initialization function | |
initsfunction <- function(chain){ | |
.RNG.seed <- c(1,2)[chain] | |
.RNG.name <- c("base::Super-Duper", | |
"base::Wichmann-Hill")[chain] | |
return(list(.RNG.seed=.RNG.seed, | |
.RNG.name=.RNG.name)) | |
} | |
# implement MCMC | |
posterior <- run.jags(modelString, | |
n.chains = 1, | |
data = b.data, | |
monitor = c("B", "mu.beta"), | |
adapt = 1000, | |
burnin = 5000, | |
sample = 20000, | |
inits = initsfunction) | |
# convert simulated draws to a data frame | |
post <- as.mcmc(posterior) | |
post <- as.data.frame(post) | |
# compute posterior means, use these to get estimated | |
# probabilities and add those to the original data frame | |
est <- apply(post, 2, mean)[1:(2 * b.data$J)] | |
est <- as.data.frame(matrix(est, b.data$J, 2, | |
byrow = FALSE)) | |
names(est) <- c("b0", "b1") | |
est$Player <- 1:b.data$J | |
d1 <- inner_join(d1, est, by = "Player") | |
d1 %>% mutate(lp = b0 + b1 * logit(HR / PA), | |
p = exp(lp) / (1 + exp(lp))) | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
run_and_evaluate <- function(season, retrodata){ | |
library(dplyr) | |
d <- get_hr_data(season, retrodata) | |
d %>% filter(HR.x > 0, | |
HR > 0) -> d | |
d$Player <- 1:dim(d)[1] | |
# multilevel fit | |
d <- multilevel_reg(d) | |
# compare prediction errors | |
d %>% | |
mutate(Season = season, | |
Rate.x = HR.x / PA.x, | |
Rate.y = HR.y / PA.y, | |
Rate.old = HR / PA, | |
Rate.20 = .20 * Rate.old + .80 * Rate.x, | |
Rate.50 = .50 * Rate.old + .50 * Rate.x, | |
Rate.80 = .80 * Rate.old + .20 * Rate.x) -> d | |
d %>% | |
summarize(N = n(), | |
Season = first(Season), | |
Current = mean(abs(Rate.x - Rate.y)), | |
Old = mean(abs(Rate.old - Rate.y)), | |
Mix.20 = mean(abs(Rate.20 - Rate.y)), | |
Mix.50 = mean(abs(Rate.50 - Rate.y)), | |
Mix.80 = mean(abs(Rate.80 - Rate.y)), | |
Bayes = mean(abs(p - Rate.y))) %>% | |
select(Season, N, Current, Old, | |
Mix.20, Mix.50, Mix.80, Bayes) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment