Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created April 23, 2023 22:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/0cba21a76e6574d136a3ffcfa09b8def to your computer and use it in GitHub Desktop.
Save bayesball/0cba21a76e6574d136a3ffcfa09b8def to your computer and use it in GitHub Desktop.
R work for Cluster Luck Scoring in 2022 Season
# read in two functions
source("generate_runs.R")
source("retro_game_work.R")
library(dplyr)
# Phillies game plays on April 16, 2023
plays <- c("HR", "1B", "BB", "BB", "1B", "OUT", "1B",
"2B", "1B", "OUT", "1B", "2B", "OUT", "OUT",
"2B", "OUT", "OUT", "OUT", "OUT", "1B", "BB", "1B",
"1B", "OUT", "2B", "1B", "BB", "OUT", "BB", "OUT",
"OUT", "2B", "OUT", "1B", "OUT", "OUT", "OUT", "BB",
"OUT", "OUT", "OUT", "HR", "OUT", "OUT", "OUT", "1B",
"OUT", "1B", "OUT", "1B", "1B", "OUT", "OUT", "1B",
"1B", "OUT")
# this function will compute the expected runs scored
# using the run scoring model
generate_runs(plays)
### d2022 contains all Retrosheet play by play in 2022 season
### "KCA202206100" is one game id
# this function will compute the expected runs for both teams
# for this particular game defined by the game id
retro_game_work("KCA202206100", d2022)
generate_runs <- function(observed_plays,
iter = 100){
require(dplyr)
# given vector of plays, simulates iter games
# outputs the mean runs scored from simulation model
runs_setup <- function(){
# sets up runner advancement in simulation
# of half inning
# based on 2015 season data
Prob_Single <- matrix(0, 8, 8)
dimnames(Prob_Single)[[1]] <- c("000", "100", "010", "001",
"110", "101", "011", "111")
dimnames(Prob_Single)[[2]] <- c("000", "100", "010", "001",
"110", "101", "011", "111")
Prob_Single["000", "100"] <- 1
Prob_Single["100", c("101", "110")] <-
round(c(1411, 3940) / (1411 + 3940), 3)
Prob_Single["010", c("100", "101")] <-
round(c(1044, 768) / (1044 + 768), 3)
Prob_Single["001", "100"] <- 1
Prob_Single["110", c("101", "110", "111")] <-
round(c(403, 605, 675) / (403 + 605 + 675), 3)
Prob_Single["101", c("101", "110")] <-
round(c(178, 583) / (178 + 583), 3)
Prob_Single["011", c("100", "101")] <-
round(c(215, 200) / (215 + 200), 3)
Prob_Single["111", c("101", "110", "111")] <-
round(c(137, 211, 266) / (137 + 211 + 266), 3)
Prob_Double <- matrix(0, 8, 8)
dimnames(Prob_Double)[[1]] <- c("000", "100", "010", "001",
"110", "101", "011", "111")
dimnames(Prob_Double)[[2]] <- c("000", "100", "010", "001",
"110", "101", "011", "111")
Prob_Double[c("000", "010", "001", "011"), "010"] <- 1
Prob_Double["100", c("011", "010")] <-
round(c(817, 591) / (817 + 591), 3)
Prob_Double["110", c("010", "011")] <-
round(c(212, 260) / (212 + 260), 3)
Prob_Double["101", c("010", "011")] <-
round(c(107, 132) / (107 + 132), 3)
Prob_Double["111", c("010", "011")] <-
round(c(74, 95) / (74 + 95), 3)
Prob_Walk <- matrix(0, 8, 8)
dimnames(Prob_Walk)[[1]] <- c("000", "100", "010", "001",
"110", "101", "011", "111")
dimnames(Prob_Walk)[[2]] <- c("000", "100", "010", "001",
"110", "101", "011", "111")
Prob_Walk["000", "100"] <- 1
Prob_Walk["100", "110"] <- 1
Prob_Walk["010", "110"] <- 1
Prob_Walk["001", "101"] <- 1
Prob_Walk[c("110", "101", "011", "111"), "111"] <- 1
# outcomes are out, walk, single, double, triple, home run
prob <- c(86304 + 37446, 13122 + 951 + 1602,
28016, 8241, 939, 4909)
prob <- prob / sum(prob)
names(prob) <- c("OUT", "BB", "1B", "2B", "3B", "HR")
list(Prob_Single=Prob_Single, Prob_Double=Prob_Double,
Prob_Walk=Prob_Walk, prob=prob)
}
game_runs <- function(plays, st){
# simulates runs scored in a game given a list
# of plays and the run advancement data in st
runs_scored <- function(event_list, setup){
runs.transition <- function(s1, s2){
before <- sum(as.numeric(unlist(strsplit(s1, split=""))))
after <- sum(as.numeric(unlist(strsplit(s2, split=""))))
before - after + 1
}
outs <- 0
bases <- "000"
runs <- 0
all_bases <- c("000", "100", "010", "001",
"110", "101", "011", "111")
for(event in event_list){
if (event=="1B") new_bases <- sample(all_bases, 1,
prob=setup$Prob_Single[bases, ])
if (event=="2B") new_bases <- sample(all_bases, 1,
prob=setup$Prob_Double[bases, ])
if (event=="BB") new_bases <- sample(all_bases, 1,
prob=setup$Prob_Walk[bases, ])
if (event=="3B") new_bases <- "001"
if (event=="HR") new_bases <- "000"
if (event=="OUT") new_bases <- bases
outs <- outs + (event == "OUT")
runs <- runs - (event == "OUT") +
runs.transition(bases, new_bases)
bases <- new_bases
}
runs}
# last play will be "OUT"
n <- length(plays)
plays <- c(sample(plays[-n]), plays[n])
n_outs <- sum(plays == "OUT")
n_innings <- ceiling(n_outs / 3)
inning <- floor((cumsum(plays == "OUT") - .5) / 3) + 1
one.inning <- function(j, st){
runs_scored(plays[inning == j], st)
}
sum(sapply(1:n_innings, one.inning, st))
}
st <- runs_setup()
mean(replicate(iter,
game_runs(observed_plays, st)))
}
retro_game_work <- function(game_id, retrodata){
require(dplyr)
# given some retrosheet data and the game id
# collects the observed sequence of plays for
# each home and visiting teams
# also finds the mean runs scored from the simulation
# model for 100 iterations
collect_retro_sequence <- function(retrodata,
game_id,
home_away){
# given retrosheet data, the game id, and
# home/visitor indicator
# collects the on-base data together with outs
# for the particular team in that game
plays <- c(14, 15, 16, 20, 21, 22, 23)
retrodata %>%
dplyr::filter(GAME_ID == game_id,
BAT_HOME_ID == home_away) -> rdata
rdata %>%
dplyr::filter(EVENT_CD %in% plays) %>%
select(EVENT_CD) %>% pull() ->
observed_on_base
team <- ifelse(home_away == 1,
as.character(substr(rdata$GAME_ID[1], 1, 3)),
as.character(rdata$AWAY_TEAM_ID[1]))
n_extra <- sum(observed_on_base >= 21)
observed_on_base <-
case_match(as.character(observed_on_base),
"14" ~ "BB",
"15" ~ "BB",
"16" ~ "BB",
"20" ~ "1B",
"21" ~ "2B",
"22" ~ "3B",
"23" ~ "HR")
n_outs <- sum(rdata$EVENT_OUTS_CT)
data.frame(Game_Id = game_id,
Team = team,
On_Base = length(observed_on_base),
X_Hit = n_extra,
N_Outs = n_outs,
Sequence = c(observed_on_base,
rep("OUT", n_outs)))
}
h_plays <- collect_retro_sequence(retrodata, game_id, 1)
v_plays <- collect_retro_sequence(retrodata, game_id, 0)
runs <- dplyr::filter(retrodata,
GAME_ID == game_id) %>%
group_by(BAT_HOME_ID) %>%
summarize(R = sum((BAT_DEST_ID > 3) +
(RUN1_DEST_ID > 3) +
(RUN2_DEST_ID > 3) +
(RUN3_DEST_ID > 3)))
h_expected <- ifelse(floor(h_plays$N_Out[1] / 3) ==
h_plays$N_Out[1] / 3,
generate_runs(h_plays$Sequence), NA)
v_expected <- ifelse(floor(v_plays$N_Out[1] / 3) ==
v_plays$N_Out[1] / 3,
generate_runs(v_plays$Sequence), NA)
df1 <- data.frame(Game_Id = h_plays$Game_Id[1],
Team = h_plays$Team[1],
Runs = runs$R[runs$BAT_HOME_ID == 1],
Expected = h_expected,
Outs = h_plays$N_Out[1],
On_Base = h_plays$On_Base[1],
X_Base = h_plays$X_Hit[1])
df2 <- data.frame(Game_Id = v_plays$Game_Id[1],
Team = v_plays$Team[1],
Runs = runs$R[runs$BAT_HOME_ID == 0],
Expected = v_expected,
Outs = v_plays$N_Out[1],
On_Base = v_plays$On_Base[1],
X_Base = v_plays$X_Hit[1])
rbind(df1, df2)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment