Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active May 31, 2016 02:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/5cb5873756d74b7dc432f674035ba1e0 to your computer and use it in GitHub Desktop.
Save bayesball/5cb5873756d74b7dc432f674035ba1e0 to your computer and use it in GitHub Desktop.
R code to compute the transition probability matrix of a Markov chain model for pitch counts
# Retrosheet play-by-play data for the 2015 season; the file is expected
# to define the data frame d2015 used below
load("~/OneDriveBusiness/Retrosheet/pbp.2015.Rdata")
# keep only the plays flagged as batting events
d2015 <- subset(d2015, BAT_EVENT_FL == TRUE)
# Recode PITCH_SEQ_TX into a ball/strike string (innermost call runs first):
#   1. delete the non-pitch symbols . > 1 2 3 N + *
#   2. codes B, I, P, V become "b"
#   3. codes C, F, K, L, M, O, Q, R, S, T become "s"
d2015$pseq <- gsub(
  "[CFKLMOQRST]", "s",
  gsub("[BIPV]", "b", gsub("[.>123N+*]", "", d2015$PITCH_SEQ_TX))
)
# one.string turns the recoded pitch sequence of a single plate
# appearance (one value of the pseq variable) into the chain of pitch
# counts it passed through.
#
# Argument:
#   ex  a string made of "b" (ball), "s" (strike) and possibly other
#       leftover symbols
# Value:
#   an unnamed list of two equal-length character vectors: the count
#   before each transition and the count after it; the first "before"
#   is always "0-0" and the last "after" is always "X"
one.string <- function(ex) {
  # a closing ball or strike ends the plate appearance (walk or
  # strikeout), so it is marked with X
  ex <- sub("[sb]$", "X", ex)
  # one element per pitch, without the closing symbol
  pitches <- unlist(strsplit(ex, ""))
  pitches <- pitches[-length(pitches)]
  # running count of balls and strikes; strikes are capped at two so
  # that extra strikes (fouls) leave the count unchanged
  balls <- cumsum(pitches == "b")
  strikes <- pmin(cumsum(pitches == "s"), 2)
  # drop a trailing count that would be ball four or strike three
  last <- length(balls)
  if (last > 0 && (balls[last] == 4 || strikes[last] == 3)) {
    balls <- balls[-last]
    strikes <- strikes[-last]
  }
  # full path through the counts, from 0-0 to the terminal outcome X
  path <- c("0-0", paste(balls, strikes, sep = "-"), "X")
  # pair every state with its successor
  list(path[-length(path)], path[-1])
}
# Apply one.string to every plate appearance. The result is a 2 x n
# list-matrix: S[1, j][[1]] holds the "before" counts of plate appearance
# j and S[2, j][[1]] the "after" counts.
# vapply() is used instead of sapply() so the shape is guaranteed: it
# stays a 2-row matrix even when there are no plate appearances, and a
# malformed one.string() result is reported as an error.
S <- vapply(d2015$pseq, one.string, FUN.VALUE = vector("list", 2))
# create_MC builds the transition probability matrix of the pitch-count
# Markov chain for a specific pitcher or hitter.
#
# Arguments:
#   p        player name, "First Last"; the first word is matched against
#            nameFirst and the remaining words against nameLast
#   pitcher  TRUE (default) selects plate appearances by PIT_ID,
#            anything else selects them by BAT_ID
# Value:
#   a 13 x 13 matrix of row proportions over the twelve pitch counts and
#   the terminal state "X", with "X" made absorbing
#
# Reads the global data frame d2015 and the global list-matrix S, whose
# column j holds the before/after counts of row j of d2015. Stops with an
# error when the name matches no player.
create_MC <- function(p, pitcher = TRUE) {
  # library() instead of require(): a missing package fails right here
  # rather than later with an unrelated error
  library(Lahman)
  library(dplyr)

  # split the name on runs of blanks; everything after the first word is
  # the surname, so multi-word surnames are matched in full
  name <- unlist(strsplit(trimws(p), " +"))
  first_name <- name[1]
  last_name <- paste(name[-1], collapse = " ")
  # NOTE(review): newer Lahman releases may call this table People --
  # confirm against the installed version
  pid <- dplyr::filter(Master, nameFirst == first_name,
                       nameLast == last_name)$retroID
  pid <- as.character(pid)
  pid <- pid[!is.na(pid) & nzchar(pid)]
  if (length(pid) == 0) {
    stop("no player named '", p, "' with a retroID found in Master",
         call. = FALSE)
  }

  states <- c("0-0", "0-1", "1-0",
              "0-2", "1-1", "2-0",
              "1-2", "2-1", "3-0",
              "2-2", "3-1", "3-2", "X")
  M <- matrix(0, length(states), length(states),
              dimnames = list(states, states))

  # rows of d2015 that belong to the player; %in% copes with several
  # players sharing one name and which() discards NA ids
  ids <- if (pitcher == TRUE) d2015$PIT_ID else d2015$BAT_ID
  PJ <- which(ids %in% pid)

  for (j in PJ) {
    start <- S[1, j][[1]]
    end <- S[2, j][[1]]
    if (length(start) > 0 && !is.na(start[1])) {
      # counted one transition at a time: the same (start, end) pair can
      # occur more than once within a plate appearance
      for (k in seq_along(start)) {
        M[start[k], end[k]] <- M[start[k], end[k]] + 1
      }
    }
  }

  # row proportions; a row with no observed transitions comes out as NaN
  P <- prop.table(M, 1)
  # the 0-1 -> 0-0 cell is forced to zero
  P["0-1", "0-0"] <- 0
  # "X" is never a starting state, so its row is set by hand to make it
  # absorbing
  P["X", ] <- 0
  P["X", "X"] <- 1
  P
}
# compute_expected_appearances computes the expected number of times
# each pitch count is visited, given the Markov chain transition matrix.
#
# Argument:
#   P  square transition matrix whose LAST row/column is the absorbing
#      state; the remaining states are the pitch counts, with the
#      starting count first
# Value:
#   a data frame with columns Count (state name) and Expected (expected
#   number of visits when starting from the first state); the starting
#   state itself is left out
compute_expected_appearances <- function(P) {
  stopifnot(nrow(P) == ncol(P))
  # the size is taken from P instead of being fixed at 13, so any chain
  # with the absorbing state last can be analysed
  absorbing <- nrow(P)
  Q <- P[-absorbing, -absorbing, drop = FALSE]
  # first row of (I - Q)^-1: expected visits to every non-absorbing
  # state when the chain starts in the first state
  E <- solve(diag(nrow(Q)) - Q)[1, ]
  data.frame(Count = names(E)[-1],
             Expected = E[-1])
}
# Example: build the Markov chain for pitcher Clayton Kershaw, then
# derive the expected number of visits to each pitch count
kershaw <- create_MC(p = "Clayton Kershaw", pitcher = TRUE)
E <- compute_expected_appearances(P = kershaw)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment