Created
September 16, 2022 12:29
-
-
Save bayesball/969f0de0cc54803d28a4dbb8f909a287 to your computer and use it in GitHub Desktop.
R function to compare five methods in predicting future home run rates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
prediction_work <- function(seasons, | |
mPA = 1000, | |
retrodata, | |
mPA_season = 200){ | |
# seasons is a vector of previous seasons | |
# mPA is the minimum number of cumulative PA | |
# retrodata - Retrosheet data for current season | |
# mPA_season - minimum number of PA in both | |
# halves of current season | |
require(lubridate) | |
require(Lahman) | |
require(dplyr) | |
get_players <- function(seasons, mPA){ | |
Batting %>% | |
filter(yearID %in% seasons) %>% | |
group_by(playerID, yearID) %>% | |
summarize(PA = sum(AB + BB + HBP), | |
HR = sum(HR), | |
.groups = "drop") -> S | |
S %>% | |
group_by(playerID) %>% | |
summarize(HR = sum(HR), | |
PA = sum(PA)) %>% | |
filter(PA >= mPA) %>% | |
inner_join(People, by = "playerID") %>% | |
select(retroID, HR, PA) | |
} | |
get_players(seasons, mPA) -> players1000 | |
retrodata %>% | |
filter(BAT_EVENT_FL == TRUE) %>% | |
mutate(Year = substr(GAME_ID, 4, 7), | |
MonthDay = substr(GAME_ID, 8, 11), | |
Date = mdy(paste(MonthDay, Year, sep = ""))) -> | |
retrodata | |
midseason <- mdy(paste("07-01-", retrodata$Year[1], | |
sep = "")) | |
da <- filter(retrodata, Date < midseason) | |
db <- filter(retrodata, Date >= midseason) | |
da %>% | |
group_by(BAT_ID) %>% | |
summarize(PA = n(), | |
HR = sum(EVENT_CD == 23)) %>% | |
mutate(Period = "First Half") -> Sa | |
db %>% | |
group_by(BAT_ID) %>% | |
summarize(PA = n(), | |
HR = sum(EVENT_CD == 23)) %>% | |
mutate(Period = "Second Half") -> Sb | |
Sab <- inner_join(Sa, Sb, by = "BAT_ID") %>% | |
filter(PA.x >= mPA_season, | |
PA.y >= mPA_season) | |
inner_join(Sab, players1000, | |
by = c("BAT_ID" = "retroID")) -> Sab1 | |
# compute five estimates | |
Sab1 %>% | |
mutate(Rate.x = HR.x / PA.x, | |
Rate.y = HR.y / PA.y, | |
Rate.old = HR / PA, | |
Rate.20 = .20 * Rate.old + .80 * Rate.x, | |
Rate.50 = .50 * Rate.old + .50 * Rate.x, | |
Rate.80 = .80 * Rate.old + .20 * Rate.x) -> Sab1 | |
# compute mean absolute prediction error | |
Sab1 %>% | |
summarize(Current = mean(abs(Rate.x - Rate.y)), | |
Old = mean(abs(Rate.old - Rate.y)), | |
Mix.20 = mean(abs(Rate.20 - Rate.y)), | |
Mix.50 = mean(abs(Rate.50 - Rate.y)), | |
Mix.80 = mean(abs(Rate.80 - Rate.y))) %>% | |
mutate(Season = retrodata$Year[1]) %>% | |
select(Season, Current, Old, | |
Mix.20, Mix.50, Mix.80) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment