Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active November 25, 2019 13:05
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/27079301fcfa43cb337d1442e61ea469 to your computer and use it in GitHub Desktop.
Save bayesball/27079301fcfa43cb337d1442e61ea469 to your computer and use it in GitHub Desktop.
Multilevel modeling of on-base trajectories
# load some packages
library(tidyverse)
library(brms)
# source two files (both included below)
source("get_onbase_data.R")
source("compute_individual_regressions.R")
# a couple of utility functions
increasefont <- function(){
theme(text=element_text(size=18))
}
centertitle <- function(){
theme(plot.title = element_text(colour = "blue", size = 18,
hjust = 0.5, vjust = 0.8, angle = 0))
}
# extract data of interest
d78 <- get_onbase_data(1974, 1000)
# wrong thing to do
d78 %>% group_by(Age) %>%
summarize(P = sum(OB) / sum(PA)) -> Sout
ggplot(Sout, aes(Age, P)) +
geom_point() +
geom_smooth(method = 'loess') +
centertitle() +
increasefont() +
ggtitle("OBP by Age")
# actually data looks like this
ggplot(d78, aes(Age, OB / PA, color = nameLast)) +
geom_point() +
increasefont()
# compute individual quadratic fits
d78_b <- compute_individual_regressions(d78)
# graph Jeter's trajectory and fit
ggplot(filter(d78_b, nameLast == "Jeter"),
aes(Age, OB / PA)) +
geom_point(size = 2) +
geom_line(aes(Age, P), color = "red", size = 1.5) +
ggtitle("Derek Jeter's OBP Trajectory") +
increasefont() + centertitle()
# fit multilevel model
fit <- brm(OB | trials(PA) ~ AgeD + I(AgeD ^ 2) +
(AgeD + I(AgeD ^ 2) | Player),
data = filter(d78, PA > 0),
family = binomial("logit"))
# extract posterior means of regression coefficients for
# all players
Player_Fits <- coef(fit)$Player[, "Estimate", ] %>%
as.data.frame() %>% mutate(Player = 1:max(d78_b$Player))
# merge these estimates with our main dataset
d78_new <- inner_join(d78_b, Player_Fits, by = "Player")
# find estimates of OBP probs at each age
d78_new %>%
mutate(PH = plogis(Intercept + AgeD.y * AgeD.x +
IAgeDE2 * AgeD.x ^ 2)) -> d78_new
# graphs of individual fits
ggplot(d78_new, aes(Age, P, group = Player)) +
geom_line(color = "blue") +
ggtitle("Individual Trajectory Fits") +
centertitle() +
increasefont()
# graphs of individual and multilevel fits
ggplot(d78_new, aes(Age, P, group = Player)) +
geom_line(alpha = 0.3, color = "blue") +
geom_line(aes(Age, PH, group = Player), color = "red") +
ggtitle("Individual and Multilevel Fits") +
centertitle() +
increasefont()
# find peak ages using two models
d78_new %>% group_by(Player) %>%
summarize(B0 = first(B0),
B1 = first(B1),
B2 = first(B2),
b0 = first(Intercept),
b1 = first(AgeD.y),
b2 = first(IAgeDE2)) %>%
mutate(Ind_Peak_Age = 30 - B1 / 2 / B2,
MLM_Peak_Age = 30 - b1 / 2 / b2) %>%
select(Ind_Peak_Age, MLM_Peak_Age) %>%
pivot_longer(everything(),
names_to = "Type",
values_to = "Age") -> S
# graph two sets of peak ages
ggplot(S, aes(Type, Age)) +
geom_point() +
coord_flip() +
increasefont() +
ggtitle("Estimates of Peak Age") +
centertitle()
compute_individual_regressions <- function(d1){
library(tidyverse)
library(broom)
regressions <- d1 %>%
split(pull(., nameLast)) %>%
map(~ glm(cbind(OB, PA - OB) ~ AgeD +
I(AgeD ^ 2),
family = binomial, data = .)) %>%
map_df(tidy, .id = "Name") %>%
as_tibble() %>% select(Name, term, estimate)
# compute predicted probabilities for all seasons and players
regressions %>% spread(term, estimate) -> regs
names(regs) <- c("Name", "B0", "B1", "B2")
inner_join(d1, regs, by=c("nameLast"="Name")) -> d1
d1 %>%
mutate(LP = B0 + B1 * AgeD +
B2 * AgeD ^ 2,
P = exp(LP) / (1 + exp(LP))) -> d1
d1
}
get_onbase_data <- function(byear, TABB){
library(Lahman)
library(tidyverse)
# pull out players born in the specific year
Master %>% filter(birthYear %in% byear) %>%
select(playerID) %>% pull() -> player_list
# collect hitters with at least TABB at-bats
Batting %>% filter(playerID %in% player_list) %>%
group_by(playerID) %>%
summarize(AB = sum(AB)) -> S1
S1 %>%
filter(AB >= TABB) %>%
pull(playerID) -> player_list2000
# collect stats for all players for all seasons
Batting %>% filter(playerID %in% player_list2000) %>%
group_by(playerID, yearID) %>%
summarize(AB = sum(AB),
BB = sum(BB),
HBP = sum(HBP),
SF = sum(SF),
H = sum(H),
OB = H + BB + HBP,
PA = AB + BB + HBP + SF) -> d
# define age variable, and age deviation from 30
select(Master, playerID, birthYear, birthMonth) %>%
inner_join(d, by=c("playerID")) %>%
mutate(birthyear = ifelse(birthMonth >= 7,
birthYear + 1, birthYear),
Age = yearID - birthyear,
AgeD = Age - 30) %>%
select(playerID, Age, AgeD, PA, OB) -> d1
# add names to data frame and create numerical id for
# player using factors
d1 %>% inner_join(select(Master, playerID, nameFirst,
nameLast),
by = "playerID") %>%
mutate(Name = paste(nameFirst, nameLast)) %>%
select(nameLast, Age, AgeD, PA, OB) -> d1
d1$Player <- as.numeric(as.factor(d1$nameLast))
d1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment