Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active September 26, 2022 00:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/4a7d75314b75ac91121885567e9b5b9a to your computer and use it in GitHub Desktop.
Save bayesball/4a7d75314b75ac91121885567e9b5b9a to your computer and use it in GitHub Desktop.
R work for the Predicting Home Runs using a Multilevel Model post
get_hr_data <- function(pred_season,
retro_data,
n_prev_seasons = 4,
mPA = 1000,
mPA_season = 200){
# n_prev_seasons is number of previous seasons
# mPA is the minimum number of cumulative PA
# retrodata - Retrosheet data for current season
# mPA_season - minimum number of PA in both
# halves of current season
require(lubridate)
require(Lahman)
require(dplyr)
get_players <- function(seasons, mPA){
Batting %>%
filter(yearID %in% seasons) %>%
group_by(playerID, yearID) %>%
summarize(PA = sum(AB + BB + HBP),
HR = sum(HR),
.groups = "drop") -> S
S %>%
group_by(playerID) %>%
summarize(HR = sum(HR),
PA = sum(PA)) %>%
filter(PA >= mPA) %>%
inner_join(People, by = "playerID") %>%
select(retroID, HR, PA)
}
seasons <- (pred_season - n_prev_seasons):
(pred_season - 1)
get_players(seasons, mPA) -> players1000
retro_data %>%
filter(BAT_EVENT_FL == TRUE) %>%
mutate(Year = substr(GAME_ID, 4, 7),
MonthDay = substr(GAME_ID, 8, 11),
Date = mdy(paste(MonthDay, Year, sep = ""))) ->
retro_data
midseason <- mdy(paste("07-01-",
retro_data$Year[1],
sep = ""))
da <- filter(retro_data, Date < midseason)
db <- filter(retro_data, Date >= midseason)
da %>%
group_by(BAT_ID) %>%
summarize(PA = n(),
HR = sum(EVENT_CD == 23)) %>%
mutate(Period = "First Half") -> Sa
db %>%
group_by(BAT_ID) %>%
summarize(PA = n(),
HR = sum(EVENT_CD == 23)) %>%
mutate(Period = "Second Half") -> Sb
Sab <- inner_join(Sa, Sb, by = "BAT_ID") %>%
filter(PA.x >= mPA_season,
PA.y >= mPA_season)
inner_join(Sab, players1000,
by = c("BAT_ID" = "retroID"))
}
multilevel_reg <- function(d1){
library(tidyverse)
library(runjags)
library(coda)
# multilevel modeling part
modelString = "
model {
for (i in 1:J){
y[i] ~ dbin(p[i], n[i])
logit(p[i]) <- a[player[i]] + b[player[i]] * x[i]
}
for (j in 1:J){
a[j] <- B[j,1]
b[j] <- B[j,2]
B[j,1:2] ~ dmnorm (mu.beta[], Tau.B[,])
}
mu.beta[1:2] ~ dmnorm(mean[1:2],prec[1:2 ,1:2])
Tau.B[1:2 , 1:2] ~ dwish(Omega[1:2 ,1:2 ], 2)
}
"
# data for JAGS
logit <- function(y){ log(y) - log(1 - y)}
b.data <- list(y = d1$HR.x,
n = d1$PA.x,
player = d1$Player,
x = logit(d1$HR / d1$PA),
J = dim(d1)[1],
mean = c(0, 0),
Omega = diag(c(.1, .1)),
prec = diag(c(1.0E-6, 1.0E-6)))
# initialization function
initsfunction <- function(chain){
.RNG.seed <- c(1,2)[chain]
.RNG.name <- c("base::Super-Duper",
"base::Wichmann-Hill")[chain]
return(list(.RNG.seed=.RNG.seed,
.RNG.name=.RNG.name))
}
# implement MCMC
posterior <- run.jags(modelString,
n.chains = 1,
data = b.data,
monitor = c("B", "mu.beta"),
adapt = 1000,
burnin = 5000,
sample = 20000,
inits = initsfunction)
# convert simulated draws to a data frame
post <- as.mcmc(posterior)
post <- as.data.frame(post)
# compute posterior means, use these to get estimated
# probabilities and add those to the original data frame
est <- apply(post, 2, mean)[1:(2 * b.data$J)]
est <- as.data.frame(matrix(est, b.data$J, 2,
byrow = FALSE))
names(est) <- c("b0", "b1")
est$Player <- 1:b.data$J
d1 <- inner_join(d1, est, by = "Player")
d1 %>% mutate(lp = b0 + b1 * logit(HR / PA),
p = exp(lp) / (1 + exp(lp)))
}
run_and_evaluate <- function(season, retrodata){
library(dplyr)
d <- get_hr_data(season, retrodata)
d %>% filter(HR.x > 0,
HR > 0) -> d
d$Player <- 1:dim(d)[1]
# multilevel fit
d <- multilevel_reg(d)
# compare prediction errors
d %>%
mutate(Season = season,
Rate.x = HR.x / PA.x,
Rate.y = HR.y / PA.y,
Rate.old = HR / PA,
Rate.20 = .20 * Rate.old + .80 * Rate.x,
Rate.50 = .50 * Rate.old + .50 * Rate.x,
Rate.80 = .80 * Rate.old + .20 * Rate.x) -> d
d %>%
summarize(N = n(),
Season = first(Season),
Current = mean(abs(Rate.x - Rate.y)),
Old = mean(abs(Rate.old - Rate.y)),
Mix.20 = mean(abs(Rate.20 - Rate.y)),
Mix.50 = mean(abs(Rate.50 - Rate.y)),
Mix.80 = mean(abs(Rate.80 - Rate.y)),
Bayes = mean(abs(p - Rate.y))) %>%
select(Season, N, Current, Old,
Mix.20, Mix.50, Mix.80, Bayes)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment