R work for Cluster Luck Scoring in 2022 Season
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# read in two functions | |
source("generate_runs.R") | |
source("retro_game_work.R") | |
library(dplyr) | |
# Phillies game plays on April 16, 2023 | |
plays <- c("HR", "1B", "BB", "BB", "1B", "OUT", "1B", | |
"2B", "1B", "OUT", "1B", "2B", "OUT", "OUT", | |
"2B", "OUT", "OUT", "OUT", "OUT", "1B", "BB", "1B", | |
"1B", "OUT", "2B", "1B", "BB", "OUT", "BB", "OUT", | |
"OUT", "2B", "OUT", "1B", "OUT", "OUT", "OUT", "BB", | |
"OUT", "OUT", "OUT", "HR", "OUT", "OUT", "OUT", "1B", | |
"OUT", "1B", "OUT", "1B", "1B", "OUT", "OUT", "1B", | |
"1B", "OUT") | |
# this function will compute the expected runs scored | |
# using the run scoring model | |
generate_runs(plays) | |
### d2022 contains all Retrosheet play by play in 2022 season | |
### "KCA202206100" is one game id | |
# this function will compute the expected runs for both teams | |
# for this particular game defined by the game id | |
retro_game_work("KCA202206100", d2022) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
generate_runs <- function(observed_plays, | |
iter = 100){ | |
require(dplyr) | |
# given vector of plays, simulates iter games | |
# outputs the mean runs scored from simulation model | |
runs_setup <- function(){ | |
# sets up runner advancement in simulation | |
# of half inning | |
# based on 2015 season data | |
Prob_Single <- matrix(0, 8, 8) | |
dimnames(Prob_Single)[[1]] <- c("000", "100", "010", "001", | |
"110", "101", "011", "111") | |
dimnames(Prob_Single)[[2]] <- c("000", "100", "010", "001", | |
"110", "101", "011", "111") | |
Prob_Single["000", "100"] <- 1 | |
Prob_Single["100", c("101", "110")] <- | |
round(c(1411, 3940) / (1411 + 3940), 3) | |
Prob_Single["010", c("100", "101")] <- | |
round(c(1044, 768) / (1044 + 768), 3) | |
Prob_Single["001", "100"] <- 1 | |
Prob_Single["110", c("101", "110", "111")] <- | |
round(c(403, 605, 675) / (403 + 605 + 675), 3) | |
Prob_Single["101", c("101", "110")] <- | |
round(c(178, 583) / (178 + 583), 3) | |
Prob_Single["011", c("100", "101")] <- | |
round(c(215, 200) / (215 + 200), 3) | |
Prob_Single["111", c("101", "110", "111")] <- | |
round(c(137, 211, 266) / (137 + 211 + 266), 3) | |
Prob_Double <- matrix(0, 8, 8) | |
dimnames(Prob_Double)[[1]] <- c("000", "100", "010", "001", | |
"110", "101", "011", "111") | |
dimnames(Prob_Double)[[2]] <- c("000", "100", "010", "001", | |
"110", "101", "011", "111") | |
Prob_Double[c("000", "010", "001", "011"), "010"] <- 1 | |
Prob_Double["100", c("011", "010")] <- | |
round(c(817, 591) / (817 + 591), 3) | |
Prob_Double["110", c("010", "011")] <- | |
round(c(212, 260) / (212 + 260), 3) | |
Prob_Double["101", c("010", "011")] <- | |
round(c(107, 132) / (107 + 132), 3) | |
Prob_Double["111", c("010", "011")] <- | |
round(c(74, 95) / (74 + 95), 3) | |
Prob_Walk <- matrix(0, 8, 8) | |
dimnames(Prob_Walk)[[1]] <- c("000", "100", "010", "001", | |
"110", "101", "011", "111") | |
dimnames(Prob_Walk)[[2]] <- c("000", "100", "010", "001", | |
"110", "101", "011", "111") | |
Prob_Walk["000", "100"] <- 1 | |
Prob_Walk["100", "110"] <- 1 | |
Prob_Walk["010", "110"] <- 1 | |
Prob_Walk["001", "101"] <- 1 | |
Prob_Walk[c("110", "101", "011", "111"), "111"] <- 1 | |
# outcomes are out, walk, single, double, triple, home run | |
prob <- c(86304 + 37446, 13122 + 951 + 1602, | |
28016, 8241, 939, 4909) | |
prob <- prob / sum(prob) | |
names(prob) <- c("OUT", "BB", "1B", "2B", "3B", "HR") | |
list(Prob_Single=Prob_Single, Prob_Double=Prob_Double, | |
Prob_Walk=Prob_Walk, prob=prob) | |
} | |
game_runs <- function(plays, st){ | |
# simulates runs scored in a game given a list | |
# of plays and the run advancement data in st | |
runs_scored <- function(event_list, setup){ | |
runs.transition <- function(s1, s2){ | |
before <- sum(as.numeric(unlist(strsplit(s1, split="")))) | |
after <- sum(as.numeric(unlist(strsplit(s2, split="")))) | |
before - after + 1 | |
} | |
outs <- 0 | |
bases <- "000" | |
runs <- 0 | |
all_bases <- c("000", "100", "010", "001", | |
"110", "101", "011", "111") | |
for(event in event_list){ | |
if (event=="1B") new_bases <- sample(all_bases, 1, | |
prob=setup$Prob_Single[bases, ]) | |
if (event=="2B") new_bases <- sample(all_bases, 1, | |
prob=setup$Prob_Double[bases, ]) | |
if (event=="BB") new_bases <- sample(all_bases, 1, | |
prob=setup$Prob_Walk[bases, ]) | |
if (event=="3B") new_bases <- "001" | |
if (event=="HR") new_bases <- "000" | |
if (event=="OUT") new_bases <- bases | |
outs <- outs + (event == "OUT") | |
runs <- runs - (event == "OUT") + | |
runs.transition(bases, new_bases) | |
bases <- new_bases | |
} | |
runs} | |
# last play will be "OUT" | |
n <- length(plays) | |
plays <- c(sample(plays[-n]), plays[n]) | |
n_outs <- sum(plays == "OUT") | |
n_innings <- ceiling(n_outs / 3) | |
inning <- floor((cumsum(plays == "OUT") - .5) / 3) + 1 | |
one.inning <- function(j, st){ | |
runs_scored(plays[inning == j], st) | |
} | |
sum(sapply(1:n_innings, one.inning, st)) | |
} | |
st <- runs_setup() | |
mean(replicate(iter, | |
game_runs(observed_plays, st))) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
retro_game_work <- function(game_id, retrodata){ | |
require(dplyr) | |
# given some retrosheet data and the game id | |
# collects the observed sequence of plays for | |
# each home and visiting teams | |
# also finds the mean runs scored from the simulation | |
# model for 100 iterations | |
collect_retro_sequence <- function(retrodata, | |
game_id, | |
home_away){ | |
# given retrosheet data, the game id, and | |
# home/visitor indicator | |
# collects the on-base data together with outs | |
# for the particular team in that game | |
plays <- c(14, 15, 16, 20, 21, 22, 23) | |
retrodata %>% | |
dplyr::filter(GAME_ID == game_id, | |
BAT_HOME_ID == home_away) -> rdata | |
rdata %>% | |
dplyr::filter(EVENT_CD %in% plays) %>% | |
select(EVENT_CD) %>% pull() -> | |
observed_on_base | |
team <- ifelse(home_away == 1, | |
as.character(substr(rdata$GAME_ID[1], 1, 3)), | |
as.character(rdata$AWAY_TEAM_ID[1])) | |
n_extra <- sum(observed_on_base >= 21) | |
observed_on_base <- | |
case_match(as.character(observed_on_base), | |
"14" ~ "BB", | |
"15" ~ "BB", | |
"16" ~ "BB", | |
"20" ~ "1B", | |
"21" ~ "2B", | |
"22" ~ "3B", | |
"23" ~ "HR") | |
n_outs <- sum(rdata$EVENT_OUTS_CT) | |
data.frame(Game_Id = game_id, | |
Team = team, | |
On_Base = length(observed_on_base), | |
X_Hit = n_extra, | |
N_Outs = n_outs, | |
Sequence = c(observed_on_base, | |
rep("OUT", n_outs))) | |
} | |
h_plays <- collect_retro_sequence(retrodata, game_id, 1) | |
v_plays <- collect_retro_sequence(retrodata, game_id, 0) | |
runs <- dplyr::filter(retrodata, | |
GAME_ID == game_id) %>% | |
group_by(BAT_HOME_ID) %>% | |
summarize(R = sum((BAT_DEST_ID > 3) + | |
(RUN1_DEST_ID > 3) + | |
(RUN2_DEST_ID > 3) + | |
(RUN3_DEST_ID > 3))) | |
h_expected <- ifelse(floor(h_plays$N_Out[1] / 3) == | |
h_plays$N_Out[1] / 3, | |
generate_runs(h_plays$Sequence), NA) | |
v_expected <- ifelse(floor(v_plays$N_Out[1] / 3) == | |
v_plays$N_Out[1] / 3, | |
generate_runs(v_plays$Sequence), NA) | |
df1 <- data.frame(Game_Id = h_plays$Game_Id[1], | |
Team = h_plays$Team[1], | |
Runs = runs$R[runs$BAT_HOME_ID == 1], | |
Expected = h_expected, | |
Outs = h_plays$N_Out[1], | |
On_Base = h_plays$On_Base[1], | |
X_Base = h_plays$X_Hit[1]) | |
df2 <- data.frame(Game_Id = v_plays$Game_Id[1], | |
Team = v_plays$Team[1], | |
Runs = runs$R[runs$BAT_HOME_ID == 0], | |
Expected = v_expected, | |
Outs = v_plays$N_Out[1], | |
On_Base = v_plays$On_Base[1], | |
X_Base = v_plays$X_Hit[1]) | |
rbind(df1, df2) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment