Last active
May 31, 2016 02:54
-
-
Save bayesball/5cb5873756d74b7dc432f674035ba1e0 to your computer and use it in GitHub Desktop.
# R code to compute the transition probability matrix of a Markov chain model for pitch counts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read in Retrosheet play-by-play data for the 2015 season.
# The .Rdata file is expected to define the data frame d2015.
load("~/OneDriveBusiness/Retrosheet/pbp.2015.Rdata")
# Limit to batting plays (rows where BAT_EVENT_FL is TRUE)
d2015 <- subset(d2015, BAT_EVENT_FL == TRUE)
# Remove all non-pitch symbols from PITCH_SEQ_TX
d2015$pseq <- gsub("[.>123N+*]", "", d2015$PITCH_SEQ_TX)
# Collapse the remaining pitch codes into a sequence of b (ball) and
# s (strike) characters; any other code is left unchanged
d2015$pseq <- gsub("[BIPV]", "b", d2015$pseq)
d2015$pseq <- gsub("[CFKLMOQRST]", "s", d2015$pseq)
# one.string converts a single value of the pseq variable into the
# "before" and "after" pitch counts of every pitch of a plate appearance.
#
# Argument:
#   ex  one character string in which "b" marks a ball and "s" a strike;
#       any other character is a pitch that changes neither total.  The
#       last character is the pitch that ends the plate appearance.
#
# Returns an unnamed list of two character vectors of equal length:
#   [[1]] the count before each pitch, always starting at "0-0"
#   [[2]] the count after each pitch, always ending with "X", the
#         end-of-plate-appearance state
one.string <- function(ex) {
  # individual pitch outcomes
  pitches <- unlist(strsplit(ex, ""))
  # The last pitch ends the plate appearance whatever it is (strikeout,
  # walk, ball in play, ...); it is represented by the final "X" state,
  # so it never contributes to a ball-strike count.
  pitches <- pitches[seq_len(max(length(pitches) - 1L, 0L))]
  # cumulative totals; strikes stay at two because a strike thrown with
  # two strikes that does not end the plate appearance leaves the count
  # unchanged
  n.balls <- cumsum(pitches == "b")
  n.strikes <- pmin(cumsum(pitches == "s"), 2L)
  # A malformed sequence may record four or more balls before the final
  # pitch.  Ball totals never decrease, so keeping the totals below four
  # cuts the sequence at the last legal count.
  legal <- n.balls < 4L
  counts <- paste(n.balls[legal], n.strikes[legal], sep = "-")
  # add the beginning and end states
  S <- c("0-0", counts, "X")
  # before and after counts
  list(S[-length(S)], S[-1])
}
# Apply one.string to every plate appearance.  S is a list matrix with
# two rows and one column per row of d2015: S[1, j][[1]] holds the
# "before" counts and S[2, j][[1]] the "after" counts of plate appearance j.
# vapply() fixes that shape through its two-element list template, whereas
# sapply() would return a plain list when d2015 has no rows.
S <- vapply(d2015$pseq, one.string, FUN.VALUE = vector("list", 2L))
# Create the transition probability matrix of the pitch-count Markov
# chain for a specific pitcher or hitter.
#
# Arguments:
#   p        player name as "First Last"; it is matched against the
#            nameFirst and nameLast columns of the Lahman player table
#            to obtain the player's retroID
#   pitcher  if TRUE (the default) use the plate appearances in which the
#            player is the pitcher (PIT_ID), otherwise those in which he
#            is the batter (BAT_ID)
#
# Relies on two objects created at the top level of this script:
#   d2015  play-by-play data frame with columns PIT_ID and BAT_ID
#   S      two-row list matrix of before/after counts; column j is
#          assumed to describe row j of d2015
#
# Returns a 13 x 13 matrix whose rows and columns are the states "0-0",
# ..., "3-2" and "X" (end of the plate appearance).  Each row sums to
# one and "X" is absorbing.  Stops with an error when the name cannot be
# resolved to exactly one player appearing in d2015.
create_MC <- function(p, pitcher = TRUE) {
  if (!requireNamespace("Lahman", quietly = TRUE)) {
    stop("package 'Lahman' is needed to look up the player id",
         call. = FALSE)
  }
  # The player table is named Master in older Lahman releases and People
  # in newer ones (assumed to carry the same nameFirst, nameLast and
  # retroID columns).
  people <- tryCatch(Lahman::Master, error = function(e) Lahman::People)

  # retroIDs of all players with the given first and last name
  find_ids <- function(first, last) {
    hit <- which(people$nameFirst == first & people$nameLast == last)
    as.character(people$retroID[hit])
  }

  name <- unlist(strsplit(p, " "))
  pid <- find_ids(name[1], name[2])
  if (length(pid) == 0L && length(name) > 2L) {
    # Names of more than two words (e.g. "Jorge De La Rosa"): try every
    # way of dividing the words into a first and a last name.
    for (k in seq_len(length(name) - 1L)) {
      pid <- c(pid,
               find_ids(paste(name[seq_len(k)], collapse = " "),
                        paste(name[-seq_len(k)], collapse = " ")))
    }
  }
  pid <- unique(pid[!is.na(pid)])
  if (length(pid) == 0L) {
    stop("no player named '", p, "' found in the Lahman player table",
         call. = FALSE)
  }

  id_column <- if (isTRUE(as.logical(pitcher))) "PIT_ID" else "BAT_ID"
  ids <- as.character(d2015[[id_column]])
  # Several players can share a name; only those present in the data count.
  pid <- pid[pid %in% ids]
  if (length(pid) == 0L) {
    stop("'", p, "' has no plate appearances in d2015$", id_column,
         call. = FALSE)
  }
  if (length(pid) > 1L) {
    stop("'", p, "' matches several players in d2015: ",
         paste(pid, collapse = ", "), call. = FALSE)
  }
  # which() drops rows with a missing id instead of producing NA indices
  PJ <- which(ids == pid)

  states <- c("0-0", "0-1", "1-0",
              "0-2", "1-1", "2-0",
              "1-2", "2-1", "3-0",
              "2-2", "3-1", "3-2", "X")

  # Pool the before/after counts of all selected plate appearances and
  # tabulate the transitions in one step.
  from <- unlist(S[1, PJ], use.names = FALSE)
  to <- unlist(S[2, PJ], use.names = FALSE)
  known <- from %in% states & to %in% states
  if (!all(known)) {
    warning(sum(!known), " transition(s) involving an unknown state ",
            "were ignored", call. = FALSE)
  }
  tab <- table(factor(from[known], levels = states),
               factor(to[known], levels = states))
  M <- matrix(as.numeric(tab), nrow = length(states),
              dimnames = list(states, states))

  P <- prop.table(M, 1)
  # A count this player never reached has an all-zero row, which
  # prop.table() turns into NaN.  Send such a state straight to "X": the
  # row then sums to one, and the state has essentially no incoming
  # transitions, so the rest of the chain is unaffected.  This also
  # covers "X" itself, which is made absorbing.
  unvisited <- rowSums(M) == 0
  P[unvisited, ] <- 0
  P[unvisited, "X"] <- 1
  P["X", ] <- 0
  P["X", "X"] <- 1
  P
}
# compute_expected_appearances computes the expected number of times
# each pitch count is visited, given the transition matrix of the
# Markov chain.
#
# Argument:
#   P  square transition matrix with state names as dimnames; the first
#      state is the starting state and the last state is the absorbing
#      one (13 states for the matrix returned by create_MC, but any size
#      of at least two works)
#
# Returns a data frame with one row per state other than the first and
# the last: Count is the state name and Expected the expected number of
# visits when the chain starts in the first state.
compute_expected_appearances <- function(P) {
  n_states <- nrow(P)
  stopifnot(n_states >= 2L, ncol(P) == n_states)
  # Q holds the transitions among the non-absorbing states
  Q <- P[-n_states, -n_states, drop = FALSE]
  # Row 1 of the inverse of (I - Q) gives the expected visits to every
  # non-absorbing state for a chain started in the first state
  E <- solve(diag(n_states - 1L) - Q)[1, ]
  # the starting state itself is not reported
  data.frame(Count = names(E)[-1],
             Expected = E[-1])
}
# Illustrate the computation of the Markov chain and of the vector of
# expected appearances for Clayton Kershaw
kershaw <- create_MC("Clayton Kershaw")
E <- compute_expected_appearances(kershaw)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment