Last active
November 25, 2019 13:05
-
-
Save bayesball/27079301fcfa43cb337d1442e61ea469 to your computer and use it in GitHub Desktop.
Multilevel modeling of on-base trajectories
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load some packages | |
library(tidyverse) | |
library(brms) | |
# source two files (both included below) | |
source("get_onbase_data.R") | |
source("compute_individual_regressions.R") | |
# a couple of utility functions | |
increasefont <- function(){ | |
theme(text=element_text(size=18)) | |
} | |
centertitle <- function(){ | |
theme(plot.title = element_text(colour = "blue", size = 18, | |
hjust = 0.5, vjust = 0.8, angle = 0)) | |
} | |
# extract data of interest | |
d78 <- get_onbase_data(1974, 1000) | |
# wrong thing to do | |
d78 %>% group_by(Age) %>% | |
summarize(P = sum(OB) / sum(PA)) -> Sout | |
ggplot(Sout, aes(Age, P)) + | |
geom_point() + | |
geom_smooth(method = 'loess') + | |
centertitle() + | |
increasefont() + | |
ggtitle("OBP by Age") | |
# actually data looks like this | |
ggplot(d78, aes(Age, OB / PA, color = nameLast)) + | |
geom_point() + | |
increasefont() | |
# compute individual quadratic fits | |
d78_b <- compute_individual_regressions(d78) | |
# graph Jeter's trajectory and fit | |
ggplot(filter(d78_b, nameLast == "Jeter"), | |
aes(Age, OB / PA)) + | |
geom_point(size = 2) + | |
geom_line(aes(Age, P), color = "red", size = 1.5) + | |
ggtitle("Derek Jeter's OBP Trajectory") + | |
increasefont() + centertitle() | |
# fit multilevel model | |
fit <- brm(OB | trials(PA) ~ AgeD + I(AgeD ^ 2) + | |
(AgeD + I(AgeD ^ 2) | Player), | |
data = filter(d78, PA > 0), | |
family = binomial("logit")) | |
# extract posterior means of regression coefficients for | |
# all players | |
Player_Fits <- coef(fit)$Player[, "Estimate", ] %>% | |
as.data.frame() %>% mutate(Player = 1:max(d78_b$Player)) | |
# merge these estimates with our main dataset | |
d78_new <- inner_join(d78_b, Player_Fits, by = "Player") | |
# find estimates of OBP probs at each age | |
d78_new %>% | |
mutate(PH = plogis(Intercept + AgeD.y * AgeD.x + | |
IAgeDE2 * AgeD.x ^ 2)) -> d78_new | |
# graphs of individual fits | |
ggplot(d78_new, aes(Age, P, group = Player)) + | |
geom_line(color = "blue") + | |
ggtitle("Individual Trajectory Fits") + | |
centertitle() + | |
increasefont() | |
# graphs of individual and multilevel fits | |
ggplot(d78_new, aes(Age, P, group = Player)) + | |
geom_line(alpha = 0.3, color = "blue") + | |
geom_line(aes(Age, PH, group = Player), color = "red") + | |
ggtitle("Individual and Multilevel Fits") + | |
centertitle() + | |
increasefont() | |
# find peak ages using two models | |
d78_new %>% group_by(Player) %>% | |
summarize(B0 = first(B0), | |
B1 = first(B1), | |
B2 = first(B2), | |
b0 = first(Intercept), | |
b1 = first(AgeD.y), | |
b2 = first(IAgeDE2)) %>% | |
mutate(Ind_Peak_Age = 30 - B1 / 2 / B2, | |
MLM_Peak_Age = 30 - b1 / 2 / b2) %>% | |
select(Ind_Peak_Age, MLM_Peak_Age) %>% | |
pivot_longer(everything(), | |
names_to = "Type", | |
values_to = "Age") -> S | |
# graph two sets of peak ages | |
ggplot(S, aes(Type, Age)) + | |
geom_point() + | |
coord_flip() + | |
increasefont() + | |
ggtitle("Estimates of Peak Age") + | |
centertitle() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
compute_individual_regressions <- function(d1){ | |
library(tidyverse) | |
library(broom) | |
regressions <- d1 %>% | |
split(pull(., nameLast)) %>% | |
map(~ glm(cbind(OB, PA - OB) ~ AgeD + | |
I(AgeD ^ 2), | |
family = binomial, data = .)) %>% | |
map_df(tidy, .id = "Name") %>% | |
as_tibble() %>% select(Name, term, estimate) | |
# compute predicted probabilities for all seasons and players | |
regressions %>% spread(term, estimate) -> regs | |
names(regs) <- c("Name", "B0", "B1", "B2") | |
inner_join(d1, regs, by=c("nameLast"="Name")) -> d1 | |
d1 %>% | |
mutate(LP = B0 + B1 * AgeD + | |
B2 * AgeD ^ 2, | |
P = exp(LP) / (1 + exp(LP))) -> d1 | |
d1 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
get_onbase_data <- function(byear, TABB){ | |
library(Lahman) | |
library(tidyverse) | |
# pull out players born in the specific year | |
Master %>% filter(birthYear %in% byear) %>% | |
select(playerID) %>% pull() -> player_list | |
# collect hitters with at least TABB at-bats | |
Batting %>% filter(playerID %in% player_list) %>% | |
group_by(playerID) %>% | |
summarize(AB = sum(AB)) -> S1 | |
S1 %>% | |
filter(AB >= TABB) %>% | |
pull(playerID) -> player_list2000 | |
# collect stats for all players for all seasons | |
Batting %>% filter(playerID %in% player_list2000) %>% | |
group_by(playerID, yearID) %>% | |
summarize(AB = sum(AB), | |
BB = sum(BB), | |
HBP = sum(HBP), | |
SF = sum(SF), | |
H = sum(H), | |
OB = H + BB + HBP, | |
PA = AB + BB + HBP + SF) -> d | |
# define age variable, and age deviation from 30 | |
select(Master, playerID, birthYear, birthMonth) %>% | |
inner_join(d, by=c("playerID")) %>% | |
mutate(birthyear = ifelse(birthMonth >= 7, | |
birthYear + 1, birthYear), | |
Age = yearID - birthyear, | |
AgeD = Age - 30) %>% | |
select(playerID, Age, AgeD, PA, OB) -> d1 | |
# add names to data frame and create numerical id for | |
# player using factors | |
d1 %>% inner_join(select(Master, playerID, nameFirst, | |
nameLast), | |
by = "playerID") %>% | |
mutate(Name = paste(nameFirst, nameLast)) %>% | |
select(nameLast, Age, AgeD, PA, OB) -> d1 | |
d1$Player <- as.numeric(as.factor(d1$nameLast)) | |
d1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment