Skip to content

Instantly share code, notes, and snippets.

@bayesball bayesball/plot.prob.home.R
Last active Jan 28, 2019

Embed
What would you like to do?
R function to compute and plot the probability that an MLB home team wins a game, given a particular lead at the end of each inning.
# Estimate, from one season of Retrosheet game logs, the probability that the
# home team wins given its run lead at the end of each of the first 8 innings.
#
# A separate logistic regression (Outcome ~ Run.diff) is fitted for each of
# the 16 half-innings; the fits for the ends of innings 1-8 are evaluated at
# leads of -4 ... 4 runs.
#
# Arguments:
#   year  single season, used to build the Retrosheet file name gl<year>.zip
#   plot  if TRUE (default), also print a ggplot2 chart of the probabilities
#
# Returns a data frame with columns Inning, Run.Lead, Prob.Win and Season
# (plus the factor RUN.LEAD when a plot was requested). The first row
# (Inning 0, Run.Lead 0) holds the overall share of games won by the home team.
#
# Needs network access (Retrosheet and GitHub); ggplot2 is needed only when
# plot is TRUE.
plot.prob.home <- function(year, plot=TRUE){
  stopifnot(length(year) == 1L, !is.na(year))

  # Download one season's game log into a private temporary directory, read
  # it, and name its columns after `headers`. The temporary files are removed
  # even when the download or the parsing fails.
  load.gamelog <- function(season, headers){
    work_dir <- tempfile("gamelog")
    dir.create(work_dir)
    on.exit(unlink(work_dir, recursive = TRUE), add = TRUE)

    zip_path <- file.path(work_dir, paste0("gl", season, ".zip"))
    download.file(
      url = paste0("http://www.retrosheet.org/gamelogs/gl", season, ".zip"),
      destfile = zip_path,
      mode = "wb"
    )

    # Use the paths reported by unzip() instead of guessing the member name,
    # so the read does not depend on the archive's file-name case.
    extracted <- unzip(zip_path, exdir = work_dir)
    txt_path <- grep("\\.txt$", extracted, ignore.case = TRUE, value = TRUE)
    if (length(txt_path) != 1L) {
      stop("expected exactly one .txt file in the game log archive for ",
           season, call. = FALSE)
    }

    # Line scores must stay text: an all-digit column would otherwise be
    # type-converted to a number and lose its leading zeros.
    header_names <- names(headers)
    col_classes <- rep(NA_character_, length(header_names))
    is_line_score <- header_names %in% c("VisitorLineScore", "HomeLineScore")
    col_classes[is_line_score] <- "character"

    gamelog <- read.table(txt_path, sep = ",", stringsAsFactors = FALSE,
                          colClasses = col_classes)
    names(gamelog) <- header_names
    gamelog
  }

  headers <- read.csv("https://raw.githubusercontent.com/beanumber/baseball_R/master/data/game_log_header.csv")
  d <- load.gamelog(year, headers)

  needed <- c("VisitorLineScore", "HomeLineScore",
              "VisitorRunsScored", "HomeRunsScore")
  missing_cols <- setdiff(needed, names(d))
  if (length(missing_cols) > 0) {
    stop("game log is missing column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # Drop games in which either team had an inning of 10 or more runs: such
  # innings are written in parentheses, so the line score can no longer be
  # split one character per inning.
  big_inning <- grepl("(", as.character(d$VisitorLineScore), fixed = TRUE) |
    grepl("(", as.character(d$HomeLineScore), fixed = TRUE)
  d <- d[!big_inning, , drop = FALSE]
  if (nrow(d) == 0) {
    stop("no usable games found for season ", year, call. = FALSE)
  }

  # Runs per inning for innings 1-8, one row per game. Characters that are
  # not digits (e.g. "x") and innings that were not played become NA; only
  # that coercion warning is silenced, the global `warn` option is untouched.
  first_eight <- function(line_scores){
    per_game <- strsplit(as.character(line_scores), split = "")
    runs <- vapply(
      per_game,
      function(chars) suppressWarnings(as.numeric(chars))[1:8],
      numeric(8)
    )
    t(runs)
  }
  V <- first_eight(d$VisitorLineScore)
  H <- first_eight(d$HomeLineScore)

  # Running scores after each full inning.
  C.V <- t(apply(V, 1, cumsum))
  C.H <- t(apply(H, 1, cumsum))

  # Running scores after each of the 16 half-innings. The visitors' total
  # changes after odd half-innings (1, 1, 2, 2, ...); the home total starts
  # at 0 and changes after even ones (0, 1, 1, 2, ..., 7, 8).
  half <- seq_len(16)
  CC.V <- C.V[, (half + 1) %/% 2, drop = FALSE]
  CC.H <- cbind(0, C.H)[, half %/% 2 + 1, drop = FALSE]

  # Game outcome: 1 if the home team won, 0 otherwise.
  O <- as.numeric(d$HomeRunsScore > d$VisitorRunsScored)

  # Logistic regression of the outcome on the home lead at one half-inning;
  # returns the (intercept, slope) pair.
  logistic.fit <- function(half.inning){
    B <- data.frame(
      Run.diff = CC.H[, half.inning] - CC.V[, half.inning],
      Outcome = O
    )
    coef(glm(Outcome ~ Run.diff, data = B, family = binomial))
  }
  S <- t(vapply(half, logistic.fit, numeric(2)))

  # Evaluate the end-of-inning fits (even half-innings) on the grid of
  # innings 1-8 x leads -4..4, innings varying fastest within each lead.
  leads <- -4:4
  innings <- seq_len(8)
  grid_inning <- rep(innings, times = length(leads))
  grid_lead <- rep(leads, each = length(innings))
  end_of_inning <- 2 * grid_inning
  D <- data.frame(
    Inning = c(0, grid_inning),
    Run.Lead = c(0, grid_lead),
    # plogis() is the inverse logit, so no extra package is required
    Prob.Win = c(mean(O),
                 plogis(S[end_of_inning, 1] + grid_lead * S[end_of_inning, 2]))
  )

  if (isTRUE(as.logical(plot))) {
    if (!requireNamespace("ggplot2", quietly = TRUE)) {
      stop("package 'ggplot2' is required when plot = TRUE", call. = FALSE)
    }
    D$RUN.LEAD <- as.factor(D$Run.Lead)
    print(
      ggplot2::ggplot(D, ggplot2::aes(Inning, Prob.Win, color = RUN.LEAD)) +
        ggplot2::geom_point(size = 4) +
        ggplot2::geom_line(size = 1.5) +
        ggplot2::labs(
          title = paste("Probability Home Team Wins At End of Each Inning:",
                        year, "Data")) +
        ggplot2::ylab("Probability Home Team Wins") +
        ggplot2::ylim(0, 1) +
        ggplot2::geom_hline(yintercept = 0.5, size = 1.5)
    )
  }

  D$Season <- year
  D
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.