Last active
April 2, 2019 09:53
-
-
Save bayesball/5fc5a1bdae3418483414 to your computer and use it in GitHub Desktop.
R functions for computing win probabilities
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
compute.win.probs <- function(d, S){ | |
# adds variables P.OLD, P.NEW, and WPA | |
# to retrosheet data with run expectancies | |
invlogit <- function(x) exp(x) / (1 + exp(x)) | |
half.inning.row <- 2 * d$INN_CT + d$BAT_HOME_ID | |
runs0 <- ifelse(d$BAT_HOME_ID == 1, | |
d$HOME_SCORE_CT - d$AWAY_SCORE_CT + d$RUNS.STATE, | |
d$HOME_SCORE_CT - d$AWAY_SCORE_CT - d$RUNS.STATE) | |
d$P.OLD <- invlogit(S[half.inning.row, "Beta0"] + | |
S[half.inning.row, "Beta1"] * runs0) | |
runs1 <- ifelse(d$BAT_HOME_ID == 1, | |
d$HOME_SCORE_CT - d$AWAY_SCORE_CT + | |
d$RUNS.NEW.STATE + d$RUNS.SCORED, | |
d$HOME_SCORE_CT - d$AWAY_SCORE_CT - | |
d$RUNS.NEW.STATE - d$RUNS.SCORED) | |
d$P.NEW <- invlogit(S[half.inning.row, "Beta0"] + | |
S[half.inning.row, "Beta1"] * runs1) | |
d$WPA <- with(d, P.NEW - P.OLD) | |
d} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
prob.win.game2 <- function(season){ | |
# for each half-inning of game through 8th inning | |
# outputs coefficients of logistic fit to | |
# prob home team wins given home team leads by x runs | |
# uses data from season | |
# revised to handle half innings | |
source("download.and.read.R") | |
d <- download.and.read(season) | |
# create Runs.game data frame | |
library(dplyr) | |
Runs.game.home <- summarize(group_by(filter(d$pbp, BAT_HOME_ID==1), | |
GAME_ID), | |
Home.Runs = sum(RUN1_DEST_ID >= 4) + | |
sum(RUN2_DEST_ID >= 4) + | |
sum(RUN3_DEST_ID >= 4) + | |
sum(BAT_DEST_ID >= 4) ) | |
Runs.game.visitor <- summarize(group_by(filter(d$pbp, BAT_HOME_ID==0), | |
GAME_ID), | |
Home.Runs = sum(RUN1_DEST_ID >= 4) + | |
sum(RUN2_DEST_ID >= 4) + | |
sum(RUN3_DEST_ID >= 4) + | |
sum(BAT_DEST_ID >= 4) ) | |
Runs.game <- merge(Runs.game.home, Runs.game.visitor, | |
by="GAME_ID") | |
names(Runs.game) <- c("GAME_ID", "Home.Runs", "Visitor.Runs") | |
# compute runs scored for each inning for home and away teams | |
library(dplyr) | |
Runs.inning.home <- summarize(group_by(filter(d$pbp, | |
BAT_HOME_ID==1), GAME_ID, INN_CT), | |
Runs = sum(RUN1_DEST_ID >= 4) + | |
sum(RUN2_DEST_ID >= 4) + | |
sum(RUN3_DEST_ID >= 4) + | |
sum(BAT_DEST_ID >= 4)) | |
# new to incorporate half-innings | |
Runs.inning.home <- rbind(data.frame(Runs.inning.home[, 1:2], | |
Runs=0, Half=0), | |
data.frame(Runs.inning.home, | |
Half=1)) | |
Runs.inning.home$Half.Inning <- with(Runs.inning.home, | |
2 * INN_CT + Half - 1) | |
Runs.inning.home <- Runs.inning.home[with(Runs.inning.home, | |
order(GAME_ID, INN_CT, Half.Inning)), ] | |
Runs.inning.visitor <- summarize(group_by(filter(d$pbp, | |
BAT_HOME_ID==0), GAME_ID, INN_CT), | |
Runs = sum(RUN1_DEST_ID >= 4) + | |
sum(RUN2_DEST_ID >= 4) + | |
sum(RUN3_DEST_ID >= 4) + | |
sum(BAT_DEST_ID >= 4)) | |
# new to incorporate half-innings | |
Runs.inning.visitor <- rbind(data.frame(Runs.inning.visitor, | |
Half=0), | |
data.frame(Runs.inning.visitor[, 1:2], | |
Runs=0, Half=1)) | |
Runs.inning.visitor$Half.Inning <- with(Runs.inning.visitor, | |
2 * INN_CT + Half - 1) | |
Runs.inning.visitor <- Runs.inning.visitor[with(Runs.inning.visitor, | |
order(GAME_ID, INN_CT, Half.Inning)), ] | |
# find running scores for each half-inning for home and away | |
library(plyr) | |
CRuns.inning.home <- ddply(Runs.inning.home, .(GAME_ID), | |
summarize, Half.Inning=Half.Inning, | |
CRuns=cumsum(Runs)) | |
CRuns.inning.visitor <- ddply(Runs.inning.visitor, .(GAME_ID), | |
summarize, Half.Inning=Half.Inning, | |
CRuns=cumsum(Runs)) | |
CRuns.inning <- merge(CRuns.inning.home, CRuns.inning.visitor, | |
by=c("GAME_ID", "Half.Inning")) | |
names(CRuns.inning) <- c("GAME_ID", "Half.Inning", | |
"Home.Runs", "Visitor.Runs") | |
CRuns.inning <- CRuns.inning[with(CRuns.inning, | |
order(GAME_ID, Half.Inning)), ] | |
detach("package:plyr", unload=TRUE) | |
# merge the inning and game data frames | |
Final <- merge(Runs.game, CRuns.inning, | |
by="GAME_ID") | |
names(Final) <- c("GAME_ID", "Final.Home", "Final.Visitor", | |
"Half.Inning", | |
"Home", "Visitor") | |
# add difference and game result variables | |
Final$Difference <- with(Final, Home - Visitor) | |
Final$Result <- with(Final, | |
ifelse(Final.Home > Final.Visitor, 1, 0)) | |
fit.model <- function(inning) | |
coef(glm(Result ~ Difference, | |
data=filter(Final, Half.Inning==inning), family=binomial)) | |
p.home.win <- mean(Final$Result) | |
S0 <- data.frame(Half.Inning=0, | |
Beta0=log(p.home.win / (1 - p.home.win)), | |
Beta1=NA) | |
S <- sapply(1:16, fit.model) | |
S1 <- rbind(S0, | |
data.frame(Half.Inning=1:16, Beta0=S[1, ], Beta1=S[2, ])) | |
S1$Inning <- c(0, rep(1:8, each=2)) | |
S1$Half <- ifelse(S1$Half.Inning / 2 == floor(S1$Half.Inning / 2), | |
"Bottom", "Top") | |
# fill in additional rows for innings up to 25 | |
S2 <- S1[16:17, ] | |
S2$Inning <- 9; S2$Half.Inning <- 17:18 | |
for (inning in 10:25){ | |
S3 <- S1[16:17, ]; S3$Inning <- inning | |
S3$Half.Inning <- c(2 * inning - 1, 2 * inning) | |
S2 <- rbind(S2, S3) | |
} | |
S1 <- rbind(S1, S2) | |
S1 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "WPA work" | |
author: "Jim Albert" | |
date: "January 23, 2015" | |
output: html_document | |
--- | |
First source three functions: | |
* prob.win.game2 computes the win probabilities after each inning | |
* compute.runs.expectancy computes the run values | |
* compute.win.probs computes the win probabilities after each play | |
These functions also use the functions download.and.read and parse.retrosheet2.pbp.R -- these two functions download and read the Retrosheet play-by-play data. | |
Should have all five functions in current working directory. Also working directory should have a folder "download" which contains two subfolders "zipped" and "unzipped". | |
```{r} | |
source("prob.win.game2.R") | |
source("compute.runs.expectancy.R") | |
source("compute.win.probs.R") | |
``` | |
Compute the win probabilities of home team after each half inning. | |
```{r} | |
S <- prob.win.game2(2014) | |
``` | |
Outline of WPA calculations: | |
1. get runs expectancy matrix | |
2. want P(win | current state) | |
3. this is approximated by | |
P(win | L = current.home.lead +- expected.runs) | |
= invlogit(beta0 + beta1 * L) | |
where (beta0, beta1) obtained from logistic fit at end of that | |
half-inning | |
Compute the run values for all plays from data from the 2014 season: | |
```{r} | |
setwd("~/Dropbox/2014 WORK/BLOG Baseball R/31 winprobnew/download.folder/unzipped") | |
d <- compute.runs.expectancy(2014) | |
setwd("~/Dropbox/2014 WORK/BLOG Baseball R/31 winprobnew") | |
``` | |
For each play, compute the probability of home win before the play, after the play, and the difference in win probabilities. | |
```{r} | |
d <- compute.win.probs(d, S) | |
``` | |
We graph the win probs for a specific game in 2014. Write a function to display probabilities for a game of interest. | |
```{r} | |
graph.game <- function(d, game.id){ | |
require(ggplot2) | |
d1 <- subset(d, substr(HALF.INNING, 1, 12) == game.id) | |
d1$Play <- 1:dim(d1)[1] | |
yr <- substr(d1[1, "HALF.INNING"], 4, 7) | |
mo <- substr(d1[1, "HALF.INNING"], 8, 9) | |
day <- substr(d1[1, "HALF.INNING"], 10, 11) | |
print(ggplot(d1, aes(Play, P.NEW)) + geom_line() + | |
ylim(0, 1) + geom_hline(yintercept=0.5, color="red") + | |
ylab("Probability Home Team Wins") + | |
labs(title = paste(d1[1, "AWAY_TEAM_ID"], "AT", | |
substr(d1[1, "HALF.INNING"], 1, 3), | |
"--", mo, "/", day, "/", yr ))) | |
d1 | |
} | |
``` | |
Display probabilities of the Philadelphia-Texas game on March 31, 2014. | |
```{r} | |
d.game <- graph.game(d, "TBA201404180") | |
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment