Skip to content

Instantly share code, notes, and snippets.

@bayesball bayesball/R markdown file
Last active Apr 2, 2019

Embed
What would you like to do?
R functions for computing win probabilities
compute.win.probs <- function(d, S){
# adds variables P.OLD, P.NEW, and WPA
# to retrosheet data with run expectancies
invlogit <- function(x) exp(x) / (1 + exp(x))
half.inning.row <- 2 * d$INN_CT + d$BAT_HOME_ID
runs0 <- ifelse(d$BAT_HOME_ID == 1,
d$HOME_SCORE_CT - d$AWAY_SCORE_CT + d$RUNS.STATE,
d$HOME_SCORE_CT - d$AWAY_SCORE_CT - d$RUNS.STATE)
d$P.OLD <- invlogit(S[half.inning.row, "Beta0"] +
S[half.inning.row, "Beta1"] * runs0)
runs1 <- ifelse(d$BAT_HOME_ID == 1,
d$HOME_SCORE_CT - d$AWAY_SCORE_CT +
d$RUNS.NEW.STATE + d$RUNS.SCORED,
d$HOME_SCORE_CT - d$AWAY_SCORE_CT -
d$RUNS.NEW.STATE - d$RUNS.SCORED)
d$P.NEW <- invlogit(S[half.inning.row, "Beta0"] +
S[half.inning.row, "Beta1"] * runs1)
d$WPA <- with(d, P.NEW - P.OLD)
d}
prob.win.game2 <- function(season){
# for each half-inning of game through 8th inning
# outputs coefficients of logistic fit to
# prob home team wins given home team leads by x runs
# uses data from season
# revised to handle half innings
source("download.and.read.R")
d <- download.and.read(season)
# create Runs.game data frame
library(dplyr)
Runs.game.home <- summarize(group_by(filter(d$pbp, BAT_HOME_ID==1),
GAME_ID),
Home.Runs = sum(RUN1_DEST_ID >= 4) +
sum(RUN2_DEST_ID >= 4) +
sum(RUN3_DEST_ID >= 4) +
sum(BAT_DEST_ID >= 4) )
Runs.game.visitor <- summarize(group_by(filter(d$pbp, BAT_HOME_ID==0),
GAME_ID),
Home.Runs = sum(RUN1_DEST_ID >= 4) +
sum(RUN2_DEST_ID >= 4) +
sum(RUN3_DEST_ID >= 4) +
sum(BAT_DEST_ID >= 4) )
Runs.game <- merge(Runs.game.home, Runs.game.visitor,
by="GAME_ID")
names(Runs.game) <- c("GAME_ID", "Home.Runs", "Visitor.Runs")
# compute runs scored for each inning for home and away teams
library(dplyr)
Runs.inning.home <- summarize(group_by(filter(d$pbp,
BAT_HOME_ID==1), GAME_ID, INN_CT),
Runs = sum(RUN1_DEST_ID >= 4) +
sum(RUN2_DEST_ID >= 4) +
sum(RUN3_DEST_ID >= 4) +
sum(BAT_DEST_ID >= 4))
# new to incorporate half-innings
Runs.inning.home <- rbind(data.frame(Runs.inning.home[, 1:2],
Runs=0, Half=0),
data.frame(Runs.inning.home,
Half=1))
Runs.inning.home$Half.Inning <- with(Runs.inning.home,
2 * INN_CT + Half - 1)
Runs.inning.home <- Runs.inning.home[with(Runs.inning.home,
order(GAME_ID, INN_CT, Half.Inning)), ]
Runs.inning.visitor <- summarize(group_by(filter(d$pbp,
BAT_HOME_ID==0), GAME_ID, INN_CT),
Runs = sum(RUN1_DEST_ID >= 4) +
sum(RUN2_DEST_ID >= 4) +
sum(RUN3_DEST_ID >= 4) +
sum(BAT_DEST_ID >= 4))
# new to incorporate half-innings
Runs.inning.visitor <- rbind(data.frame(Runs.inning.visitor,
Half=0),
data.frame(Runs.inning.visitor[, 1:2],
Runs=0, Half=1))
Runs.inning.visitor$Half.Inning <- with(Runs.inning.visitor,
2 * INN_CT + Half - 1)
Runs.inning.visitor <- Runs.inning.visitor[with(Runs.inning.visitor,
order(GAME_ID, INN_CT, Half.Inning)), ]
# find running scores for each half-inning for home and away
library(plyr)
CRuns.inning.home <- ddply(Runs.inning.home, .(GAME_ID),
summarize, Half.Inning=Half.Inning,
CRuns=cumsum(Runs))
CRuns.inning.visitor <- ddply(Runs.inning.visitor, .(GAME_ID),
summarize, Half.Inning=Half.Inning,
CRuns=cumsum(Runs))
CRuns.inning <- merge(CRuns.inning.home, CRuns.inning.visitor,
by=c("GAME_ID", "Half.Inning"))
names(CRuns.inning) <- c("GAME_ID", "Half.Inning",
"Home.Runs", "Visitor.Runs")
CRuns.inning <- CRuns.inning[with(CRuns.inning,
order(GAME_ID, Half.Inning)), ]
detach("package:plyr", unload=TRUE)
# merge the inning and game data frames
Final <- merge(Runs.game, CRuns.inning,
by="GAME_ID")
names(Final) <- c("GAME_ID", "Final.Home", "Final.Visitor",
"Half.Inning",
"Home", "Visitor")
# add difference and game result variables
Final$Difference <- with(Final, Home - Visitor)
Final$Result <- with(Final,
ifelse(Final.Home > Final.Visitor, 1, 0))
fit.model <- function(inning)
coef(glm(Result ~ Difference,
data=filter(Final, Half.Inning==inning), family=binomial))
p.home.win <- mean(Final$Result)
S0 <- data.frame(Half.Inning=0,
Beta0=log(p.home.win / (1 - p.home.win)),
Beta1=NA)
S <- sapply(1:16, fit.model)
S1 <- rbind(S0,
data.frame(Half.Inning=1:16, Beta0=S[1, ], Beta1=S[2, ]))
S1$Inning <- c(0, rep(1:8, each=2))
S1$Half <- ifelse(S1$Half.Inning / 2 == floor(S1$Half.Inning / 2),
"Bottom", "Top")
# fill in additional rows for innings up to 25
S2 <- S1[16:17, ]
S2$Inning <- 9; S2$Half.Inning <- 17:18
for (inning in 10:25){
S3 <- S1[16:17, ]; S3$Inning <- inning
S3$Half.Inning <- c(2 * inning - 1, 2 * inning)
S2 <- rbind(S2, S3)
}
S1 <- rbind(S1, S2)
S1
}
---
title: "WPA work"
author: "Jim Albert"
date: "January 23, 2015"
output: html_document
---
First source three functions:
* prob.win.game2 computes the win probabilities after each inning
* compute.runs.expectancy computes the run values
* compute.win.probs computes the win probabilities after each play
These functions also use the functions download.and.read and parse.retrosheet2.pbp.R -- these two functions download and read the Retrosheet play-by-play data.
Should have all five functions in current working directory. Also working directory should have a folder "download" which contains two subfolders "zipped" and "unzipped".
```{r}
source("prob.win.game2.R")
source("compute.runs.expectancy.R")
source("compute.win.probs.R")
```
Compute the win probabilities of home team after each half inning.
```{r}
S <- prob.win.game2(2014)
```
Outline of WPA calculations:
1. get runs expectancy matrix
2. want P(win | current state)
3. this is approximated by
P(win | L = current.home.lead +- expected.runs)
= invlogit(beta0 + beta1 * L)
where (beta0, beta1) obtained from logistic fit at end of that
half-inning
Compute the run values for all plays from data from the 2014 season:
```{r}
setwd("~/Dropbox/2014 WORK/BLOG Baseball R/31 winprobnew/download.folder/unzipped")
d <- compute.runs.expectancy(2014)
setwd("~/Dropbox/2014 WORK/BLOG Baseball R/31 winprobnew")
```
For each play, compute the probability of home win before the play, after the play, and the difference in win probabilities.
```{r}
d <- compute.win.probs(d, S)
```
We graph the win probs for a specific game in 2014. Write a function to display probabilities for a game of interest.
```{r}
graph.game <- function(d, game.id){
require(ggplot2)
d1 <- subset(d, substr(HALF.INNING, 1, 12) == game.id)
d1$Play <- 1:dim(d1)[1]
yr <- substr(d1[1, "HALF.INNING"], 4, 7)
mo <- substr(d1[1, "HALF.INNING"], 8, 9)
day <- substr(d1[1, "HALF.INNING"], 10, 11)
print(ggplot(d1, aes(Play, P.NEW)) + geom_line() +
ylim(0, 1) + geom_hline(yintercept=0.5, color="red") +
ylab("Probability Home Team Wins") +
labs(title = paste(d1[1, "AWAY_TEAM_ID"], "AT",
substr(d1[1, "HALF.INNING"], 1, 3),
"--", mo, "/", day, "/", yr )))
d1
}
```
Display probabilities of the Philadelphia-Texas game on March 31, 2014.
```{r}
d.game <- graph.game(d, "TBA201404180")
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.