Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created September 16, 2022 12:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/969f0de0cc54803d28a4dbb8f909a287 to your computer and use it in GitHub Desktop.
Save bayesball/969f0de0cc54803d28a4dbb8f909a287 to your computer and use it in GitHub Desktop.
R function to compare five methods in predicting future home run rates
prediction_work <- function(seasons,
mPA = 1000,
retrodata,
mPA_season = 200){
# seasons is a vector of previous seasons
# mPA is the minimum number of cumulative PA
# retrodata - Retrosheet data for current season
# mPA_season - minimum number of PA in both
# halves of current season
require(lubridate)
require(Lahman)
require(dplyr)
get_players <- function(seasons, mPA){
Batting %>%
filter(yearID %in% seasons) %>%
group_by(playerID, yearID) %>%
summarize(PA = sum(AB + BB + HBP),
HR = sum(HR),
.groups = "drop") -> S
S %>%
group_by(playerID) %>%
summarize(HR = sum(HR),
PA = sum(PA)) %>%
filter(PA >= mPA) %>%
inner_join(People, by = "playerID") %>%
select(retroID, HR, PA)
}
get_players(seasons, mPA) -> players1000
retrodata %>%
filter(BAT_EVENT_FL == TRUE) %>%
mutate(Year = substr(GAME_ID, 4, 7),
MonthDay = substr(GAME_ID, 8, 11),
Date = mdy(paste(MonthDay, Year, sep = ""))) ->
retrodata
midseason <- mdy(paste("07-01-", retrodata$Year[1],
sep = ""))
da <- filter(retrodata, Date < midseason)
db <- filter(retrodata, Date >= midseason)
da %>%
group_by(BAT_ID) %>%
summarize(PA = n(),
HR = sum(EVENT_CD == 23)) %>%
mutate(Period = "First Half") -> Sa
db %>%
group_by(BAT_ID) %>%
summarize(PA = n(),
HR = sum(EVENT_CD == 23)) %>%
mutate(Period = "Second Half") -> Sb
Sab <- inner_join(Sa, Sb, by = "BAT_ID") %>%
filter(PA.x >= mPA_season,
PA.y >= mPA_season)
inner_join(Sab, players1000,
by = c("BAT_ID" = "retroID")) -> Sab1
# compute five estimates
Sab1 %>%
mutate(Rate.x = HR.x / PA.x,
Rate.y = HR.y / PA.y,
Rate.old = HR / PA,
Rate.20 = .20 * Rate.old + .80 * Rate.x,
Rate.50 = .50 * Rate.old + .50 * Rate.x,
Rate.80 = .80 * Rate.old + .20 * Rate.x) -> Sab1
# compute mean absolute prediction error
Sab1 %>%
summarize(Current = mean(abs(Rate.x - Rate.y)),
Old = mean(abs(Rate.old - Rate.y)),
Mix.20 = mean(abs(Rate.20 - Rate.y)),
Mix.50 = mean(abs(Rate.50 - Rate.y)),
Mix.80 = mean(abs(Rate.80 - Rate.y))) %>%
mutate(Season = retrodata$Year[1]) %>%
select(Season, Current, Old,
Mix.20, Mix.50, Mix.80)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment