bayesball/R markdown file

## compute.win.probs.R
compute.win.probs <- function(d, S){
  # adds variables P.OLD, P.NEW, and WPA
  # to retrosheet data with run expectancies
  invlogit <- function(x) exp(x) / (1 + exp(x))

  half.inning.row <- 2 * d$INN_CT + d$BAT_HOME_ID
  runs0 <- ifelse(d$BAT_HOME_ID == 1,
                  d$HOME_SCORE_CT - d$AWAY_SCORE_CT + d$RUNS.STATE,
                  d$HOME_SCORE_CT - d$AWAY_SCORE_CT - d$RUNS.STATE)
  d$P.OLD <- invlogit(S[half.inning.row, "Beta0"] +
                        S[half.inning.row, "Beta1"] * runs0)
  runs1 <- ifelse(d$BAT_HOME_ID == 1,
                  d$HOME_SCORE_CT - d$AWAY_SCORE_CT +
                    d$RUNS.NEW.STATE + d$RUNS.SCORED,
                  d$HOME_SCORE_CT - d$AWAY_SCORE_CT -
                    d$RUNS.NEW.STATE - d$RUNS.SCORED)
  d$P.NEW <- invlogit(S[half.inning.row, "Beta0"] +
                        S[half.inning.row, "Beta1"] * runs1)

  d$WPA <- with(d, P.NEW - P.OLD)
  d}

## prob.win.game2.R
prob.win.game2 <- function(season){

# for each half-inning of game through 8th inning
# outputs coefficients of logistic fit to
# prob home team wins given home team leads by x runs
# uses data from season

# revised to handle half innings

source("download.and.read.R")

d <- download.and.read(season)

# create Runs.game data frame

library(dplyr)
Runs.game.home <- summarize(group_by(filter(d$pbp, BAT_HOME_ID==1),
                                     GAME_ID),
                            Home.Runs = sum(RUN1_DEST_ID >= 4) +
                              sum(RUN2_DEST_ID >= 4) +
                              sum(RUN3_DEST_ID >= 4) +
                              sum(BAT_DEST_ID >= 4) )
Runs.game.visitor <- summarize(group_by(filter(d$pbp, BAT_HOME_ID==0),
                                        GAME_ID),
                               Home.Runs = sum(RUN1_DEST_ID >= 4) +
                                 sum(RUN2_DEST_ID >= 4) +
                                 sum(RUN3_DEST_ID >= 4) +
                                 sum(BAT_DEST_ID >= 4) )
Runs.game <- merge(Runs.game.home, Runs.game.visitor,
                   by="GAME_ID")
names(Runs.game) <- c("GAME_ID", "Home.Runs", "Visitor.Runs")

# compute runs scored for each inning for home and away teams

library(dplyr)
Runs.inning.home <- summarize(group_by(filter(d$pbp,
                  BAT_HOME_ID==1), GAME_ID, INN_CT),
               Runs = sum(RUN1_DEST_ID >= 4) +
                      sum(RUN2_DEST_ID >= 4) +
                      sum(RUN3_DEST_ID >= 4) +
                      sum(BAT_DEST_ID >= 4))
# new to incorporate half-innings
Runs.inning.home <- rbind(data.frame(Runs.inning.home[, 1:2],
                                    Runs=0, Half=0),
                         data.frame(Runs.inning.home,
                                    Half=1))
Runs.inning.home$Half.Inning <- with(Runs.inning.home,
                               2 * INN_CT + Half - 1)
Runs.inning.home <- Runs.inning.home[with(Runs.inning.home,
                      order(GAME_ID, INN_CT, Half.Inning)), ]

Runs.inning.visitor <- summarize(group_by(filter(d$pbp,
                  BAT_HOME_ID==0), GAME_ID, INN_CT),
               Runs = sum(RUN1_DEST_ID >= 4) +
                      sum(RUN2_DEST_ID >= 4) +
                      sum(RUN3_DEST_ID >= 4) +
                      sum(BAT_DEST_ID >= 4))

# new to incorporate half-innings
Runs.inning.visitor <- rbind(data.frame(Runs.inning.visitor,
                                        Half=0),
                             data.frame(Runs.inning.visitor[, 1:2],
                                    Runs=0, Half=1))
Runs.inning.visitor$Half.Inning <- with(Runs.inning.visitor,
                                     2 * INN_CT + Half - 1)
Runs.inning.visitor <- Runs.inning.visitor[with(Runs.inning.visitor,
                          order(GAME_ID, INN_CT, Half.Inning)), ]

# find running scores for each half-inning for home and away

library(plyr)
CRuns.inning.home <- ddply(Runs.inning.home, .(GAME_ID),
                           summarize, Half.Inning=Half.Inning,
                           CRuns=cumsum(Runs))
CRuns.inning.visitor <- ddply(Runs.inning.visitor, .(GAME_ID),
                           summarize, Half.Inning=Half.Inning,
                           CRuns=cumsum(Runs))
CRuns.inning <- merge(CRuns.inning.home, CRuns.inning.visitor,
                      by=c("GAME_ID", "Half.Inning"))
names(CRuns.inning) <- c("GAME_ID", "Half.Inning",
                         "Home.Runs", "Visitor.Runs")
CRuns.inning <- CRuns.inning[with(CRuns.inning,
                              order(GAME_ID, Half.Inning)), ]
detach("package:plyr", unload=TRUE)

# merge the inning and game data frames

Final <- merge(Runs.game, CRuns.inning,
               by="GAME_ID")
names(Final) <- c("GAME_ID", "Final.Home", "Final.Visitor",
                  "Half.Inning",
                  "Home", "Visitor")

# add difference and game result variables

Final$Difference <- with(Final, Home - Visitor)
Final$Result <- with(Final,
                     ifelse(Final.Home > Final.Visitor, 1, 0))

fit.model <- function(inning)
 coef(glm(Result ~ Difference,
         data=filter(Final, Half.Inning==inning), family=binomial))

p.home.win <- mean(Final$Result)
S0 <- data.frame(Half.Inning=0,
                 Beta0=log(p.home.win / (1 - p.home.win)),
                 Beta1=NA)
S <- sapply(1:16, fit.model)
S1 <- rbind(S0,
    data.frame(Half.Inning=1:16, Beta0=S[1, ], Beta1=S[2, ]))
S1$Inning <- c(0, rep(1:8, each=2))
S1$Half <- ifelse(S1$Half.Inning / 2 == floor(S1$Half.Inning / 2),
                 "Bottom", "Top")

# fill in additional rows for innings up to 25

S2 <- S1[16:17, ]
S2$Inning <- 9; S2$Half.Inning <- 17:18
for (inning in 10:25){
  S3 <- S1[16:17, ]; S3$Inning <- inning
  S3$Half.Inning <- c(2 * inning - 1, 2 * inning)
  S2 <- rbind(S2, S3)
}
S1 <- rbind(S1, S2)
S1

}

## R markdown file
---
title: "WPA work"
author: "Jim Albert"
date: "January 23, 2015"
output: html_document
---

First source three functions:

* prob.win.game2 computes the win probabilities after each inning
* compute.runs.expectancy computes the run values
* compute.win.probs computes the win probabilities after each play

These functions also use the functions download.and.read and parse.retrosheet2.pbp.R -- these two functions download and read the Retrosheet play-by-play data.

Should have all five functions in current working directory.  Also working directory should have a folder "download" which contains two subfolders "zipped" and "unzipped".

```{r}
source("prob.win.game2.R")
source("compute.runs.expectancy.R")
source("compute.win.probs.R")
```

Compute the win probabilities of home team after each half inning.

```{r}
S <- prob.win.game2(2014)
```
Outline of WPA calculations:

1.  get runs expectancy matrix
2.  want P(win | current state)
3.  this is approximated by
    P(win | L = current.home.lead +- expected.runs)
  = invlogit(beta0 + beta1 * L)
where (beta0, beta1) obtained from logistic fit at end of that
half-inning


Compute the run values for all plays from data from the 2014 season:

```{r}
setwd("~/Dropbox/2014 WORK/BLOG Baseball R/31 winprobnew/download.folder/unzipped")
d <- compute.runs.expectancy(2014)
setwd("~/Dropbox/2014 WORK/BLOG Baseball R/31 winprobnew")
```

For each play, compute the probability of home win before the play, after the play, and the difference in win probabilities.

```{r}
d <- compute.win.probs(d, S)
```

We graph the win probs for a specific game in 2014.  Write a function to display probabilities for a game of interest.

```{r}
graph.game <- function(d, game.id){
  require(ggplot2)
  d1 <- subset(d, substr(HALF.INNING, 1, 12) == game.id)
  d1$Play <- 1:dim(d1)[1]
  yr <- substr(d1[1, "HALF.INNING"], 4, 7)
  mo <- substr(d1[1, "HALF.INNING"], 8, 9)
  day <- substr(d1[1, "HALF.INNING"], 10, 11)
  print(ggplot(d1, aes(Play, P.NEW)) + geom_line() +
    ylim(0, 1) + geom_hline(yintercept=0.5, color="red") +
    ylab("Probability Home Team Wins") +
    labs(title = paste(d1[1, "AWAY_TEAM_ID"], "AT",
                     substr(d1[1, "HALF.INNING"], 1, 3),
                     "--", mo, "/", day, "/", yr )))
  d1
}
```
Display probabilities of the Philadelphia-Texas game on March 31, 2014.

```{r}
d.game <- graph.game(d, "TBA201404180")
```
	compute.win.probs <- function(d, S){
	# adds variables P.OLD, P.NEW, and WPA
	# to retrosheet data with run expectancies
	invlogit <- function(x) exp(x) / (1 + exp(x))

	half.inning.row <- 2 * d$INN_CT + d$BAT_HOME_ID
	runs0 <- ifelse(d$BAT_HOME_ID == 1,
	d$HOME_SCORE_CT - d$AWAY_SCORE_CT + d$RUNS.STATE,
	d$HOME_SCORE_CT - d$AWAY_SCORE_CT - d$RUNS.STATE)
	d$P.OLD <- invlogit(S[half.inning.row, "Beta0"] +
	S[half.inning.row, "Beta1"] * runs0)
	runs1 <- ifelse(d$BAT_HOME_ID == 1,
	d$HOME_SCORE_CT - d$AWAY_SCORE_CT +
	d$RUNS.NEW.STATE + d$RUNS.SCORED,
	d$HOME_SCORE_CT - d$AWAY_SCORE_CT -
	d$RUNS.NEW.STATE - d$RUNS.SCORED)
	d$P.NEW <- invlogit(S[half.inning.row, "Beta0"] +
	S[half.inning.row, "Beta1"] * runs1)

	d$WPA <- with(d, P.NEW - P.OLD)
	d}
	prob.win.game2 <- function(season){

	# for each half-inning of game through 8th inning
	# outputs coefficients of logistic fit to
	# prob home team wins given home team leads by x runs
	# uses data from season

	# revised to handle half innings

	source("download.and.read.R")

	d <- download.and.read(season)

	# create Runs.game data frame

	library(dplyr)
	Runs.game.home <- summarize(group_by(filter(d$pbp, BAT_HOME_ID==1),
	GAME_ID),
	Home.Runs = sum(RUN1_DEST_ID >= 4) +
	sum(RUN2_DEST_ID >= 4) +
	sum(RUN3_DEST_ID >= 4) +
	sum(BAT_DEST_ID >= 4) )
	Runs.game.visitor <- summarize(group_by(filter(d$pbp, BAT_HOME_ID==0),
	GAME_ID),
	Home.Runs = sum(RUN1_DEST_ID >= 4) +
	sum(RUN2_DEST_ID >= 4) +
	sum(RUN3_DEST_ID >= 4) +
	sum(BAT_DEST_ID >= 4) )
	Runs.game <- merge(Runs.game.home, Runs.game.visitor,
	by="GAME_ID")
	names(Runs.game) <- c("GAME_ID", "Home.Runs", "Visitor.Runs")

	# compute runs scored for each inning for home and away teams

	library(dplyr)
	Runs.inning.home <- summarize(group_by(filter(d$pbp,
	BAT_HOME_ID==1), GAME_ID, INN_CT),
	Runs = sum(RUN1_DEST_ID >= 4) +
	sum(RUN2_DEST_ID >= 4) +
	sum(RUN3_DEST_ID >= 4) +
	sum(BAT_DEST_ID >= 4))
	# new to incorporate half-innings
	Runs.inning.home <- rbind(data.frame(Runs.inning.home[, 1:2],
	Runs=0, Half=0),
	data.frame(Runs.inning.home,
	Half=1))
	Runs.inning.home$Half.Inning <- with(Runs.inning.home,
	2 * INN_CT + Half - 1)
	Runs.inning.home <- Runs.inning.home[with(Runs.inning.home,
	order(GAME_ID, INN_CT, Half.Inning)), ]

	Runs.inning.visitor <- summarize(group_by(filter(d$pbp,
	BAT_HOME_ID==0), GAME_ID, INN_CT),
	Runs = sum(RUN1_DEST_ID >= 4) +
	sum(RUN2_DEST_ID >= 4) +
	sum(RUN3_DEST_ID >= 4) +
	sum(BAT_DEST_ID >= 4))

	# new to incorporate half-innings
	Runs.inning.visitor <- rbind(data.frame(Runs.inning.visitor,
	Half=0),
	data.frame(Runs.inning.visitor[, 1:2],
	Runs=0, Half=1))
	Runs.inning.visitor$Half.Inning <- with(Runs.inning.visitor,
	2 * INN_CT + Half - 1)
	Runs.inning.visitor <- Runs.inning.visitor[with(Runs.inning.visitor,
	order(GAME_ID, INN_CT, Half.Inning)), ]

	# find running scores for each half-inning for home and away

	library(plyr)
	CRuns.inning.home <- ddply(Runs.inning.home, .(GAME_ID),
	summarize, Half.Inning=Half.Inning,
	CRuns=cumsum(Runs))
	CRuns.inning.visitor <- ddply(Runs.inning.visitor, .(GAME_ID),
	summarize, Half.Inning=Half.Inning,
	CRuns=cumsum(Runs))
	CRuns.inning <- merge(CRuns.inning.home, CRuns.inning.visitor,
	by=c("GAME_ID", "Half.Inning"))
	names(CRuns.inning) <- c("GAME_ID", "Half.Inning",
	"Home.Runs", "Visitor.Runs")
	CRuns.inning <- CRuns.inning[with(CRuns.inning,
	order(GAME_ID, Half.Inning)), ]
	detach("package:plyr", unload=TRUE)

	# merge the inning and game data frames

	Final <- merge(Runs.game, CRuns.inning,
	by="GAME_ID")
	names(Final) <- c("GAME_ID", "Final.Home", "Final.Visitor",
	"Half.Inning",
	"Home", "Visitor")

	# add difference and game result variables

	Final$Difference <- with(Final, Home - Visitor)
	Final$Result <- with(Final,
	ifelse(Final.Home > Final.Visitor, 1, 0))

	fit.model <- function(inning)
	coef(glm(Result ~ Difference,
	data=filter(Final, Half.Inning==inning), family=binomial))

	p.home.win <- mean(Final$Result)
	S0 <- data.frame(Half.Inning=0,
	Beta0=log(p.home.win / (1 - p.home.win)),
	Beta1=NA)
	S <- sapply(1:16, fit.model)
	S1 <- rbind(S0,
	data.frame(Half.Inning=1:16, Beta0=S[1, ], Beta1=S[2, ]))
	S1$Inning <- c(0, rep(1:8, each=2))
	S1$Half <- ifelse(S1$Half.Inning / 2 == floor(S1$Half.Inning / 2),
	"Bottom", "Top")

	# fill in additional rows for innings up to 25

	S2 <- S1[16:17, ]
	S2$Inning <- 9; S2$Half.Inning <- 17:18
	for (inning in 10:25){
	S3 <- S1[16:17, ]; S3$Inning <- inning
	S3$Half.Inning <- c(2 * inning - 1, 2 * inning)
	S2 <- rbind(S2, S3)
	}
	S1 <- rbind(S1, S2)
	S1

	}
	---
	title: "WPA work"
	author: "Jim Albert"
	date: "January 23, 2015"
	output: html_document
	---

	First source three functions:

	* prob.win.game2 computes the win probabilities after each inning
	* compute.runs.expectancy computes the run values
	* compute.win.probs computes the win probabilities after each play

	These functions also use the functions download.and.read and parse.retrosheet2.pbp.R -- these two functions download and read the Retrosheet play-by-play data.

	Should have all five functions in current working directory. Also working directory should have a folder "download" which contains two subfolders "zipped" and "unzipped".

	```{r}
	source("prob.win.game2.R")
	source("compute.runs.expectancy.R")
	source("compute.win.probs.R")
	```

	Compute the win probabilities of home team after each half inning.

	```{r}
	S <- prob.win.game2(2014)
	```
	Outline of WPA calculations:

	1. get runs expectancy matrix
	2. want P(win \| current state)
	3. this is approximated by
	P(win \| L = current.home.lead +- expected.runs)
	= invlogit(beta0 + beta1 * L)
	where (beta0, beta1) obtained from logistic fit at end of that
	half-inning


	Compute the run values for all plays from data from the 2014 season:

	```{r}
	setwd("~/Dropbox/2014 WORK/BLOG Baseball R/31 winprobnew/download.folder/unzipped")
	d <- compute.runs.expectancy(2014)
	setwd("~/Dropbox/2014 WORK/BLOG Baseball R/31 winprobnew")
	```

	For each play, compute the probability of home win before the play, after the play, and the difference in win probabilities.

	```{r}
	d <- compute.win.probs(d, S)
	```

	We graph the win probs for a specific game in 2014. Write a function to display probabilities for a game of interest.

	```{r}
	graph.game <- function(d, game.id){
	require(ggplot2)
	d1 <- subset(d, substr(HALF.INNING, 1, 12) == game.id)
	d1$Play <- 1:dim(d1)[1]
	yr <- substr(d1[1, "HALF.INNING"], 4, 7)
	mo <- substr(d1[1, "HALF.INNING"], 8, 9)
	day <- substr(d1[1, "HALF.INNING"], 10, 11)
	print(ggplot(d1, aes(Play, P.NEW)) + geom_line() +
	ylim(0, 1) + geom_hline(yintercept=0.5, color="red") +
	ylab("Probability Home Team Wins") +
	labs(title = paste(d1[1, "AWAY_TEAM_ID"], "AT",
	substr(d1[1, "HALF.INNING"], 1, 3),
	"--", mo, "/", day, "/", yr )))
	d1
	}
	```
	Display probabilities of the Philadelphia-Texas game on March 31, 2014.

	```{r}
	d.game <- graph.game(d, "TBA201404180")
	```