# bayesball/markov_chain_pitch_count.R

## markov_chain_pitch_count.R
# Load the 2015 Retrosheet play-by-play data.  The statements below use a
# data frame called d2015, which is presumably supplied by this file.
load("~/OneDriveBusiness/Retrosheet/pbp.2015.Rdata")

# Keep only the rows flagged as batting events.
d2015 <- subset(d2015, subset = BAT_EVENT_FL == TRUE)

# Derive the simplified pitch sequence pseq from PITCH_SEQ_TX in three
# passes: strip the non-pitch markers, then collapse the remaining codes
# into "b" (ball) and "s" (strike).  local() keeps the intermediate value
# out of the global environment.
d2015$pseq <- local({
  codes <- gsub("[.>123N+*]", "", d2015$PITCH_SEQ_TX)
  codes <- gsub("[BIPV]", "b", codes)
  gsub("[CFKLMOQRST]", "s", codes)
})

# one.string converts one simplified pitch sequence (a single value of the
# pseq variable, e.g. "bsX") into the pitch counts visited during that
# plate appearance.  It returns an unnamed two-element list: the counts
# "before" each transition (always starting at "0-0") and the counts
# "after" each transition (always ending in the terminal state "X").

one.string <- function(ex){
  # A trailing ball or strike ended the plate appearance (walk or
  # strikeout), so it is rewritten as the terminal symbol X.
  ex <- sub("[sb]$", "X", ex)
  # Split into single pitches and drop the final (terminal) one.
  pitches <- head(unlist(strsplit(ex, "")), -1)
  # Running count after every remaining pitch; strikes are capped at 2.
  balls <- cumsum(pitches == "b")
  strikes <- pmin(cumsum(pitches == "s"), 2)
  # Sanity check: discard a final count that already shows ball four or
  # strike three.
  last <- length(balls)
  if (last > 0 && (balls[last] == 4 || strikes[last] == 3)) {
    keep <- seq_len(last - 1)
    balls <- balls[keep]
    strikes <- strikes[keep]
  }
  # Full path of states: start at 0-0, finish in X.
  path <- c("0-0", paste(balls, strikes, sep = "-"), "X")
  # "before" = every state but the last, "after" = every state but the first.
  list(head(path, -1), tail(path, -1))
}

# Apply one.string to every plate appearance.  S is a list-matrix with
# two rows (row 1 = "before" counts, row 2 = "after" counts) and one
# column per row of d2015.  vapply() guarantees that shape even when
# d2015 has no rows, where sapply() would return an empty list and break
# the S[1, j] / S[2, j] indexing used later.
S <- vapply(d2015$pseq, one.string, FUN.VALUE = vector("list", 2))

# create_MC builds the pitch-count Markov chain transition matrix for one
# player.  It reads the global objects d2015 (play-by-play data) and S
# (before / after pitch counts, one column per row of d2015).
#
# Arguments:
#   p       - player name as "First Last"; the first two space-separated
#             words are matched against nameFirst / nameLast in Lahman's
#             Master table to obtain the retroID.
#   pitcher - TRUE (default) selects plate appearances by PIT_ID,
#             anything else selects them by BAT_ID.
#
# Returns a 13 x 13 matrix of row-wise transition proportions over the 12
# pitch counts plus the absorbing state "X".  Stops with an error when the
# name matches no retroID.
create_MC <- function(p, pitcher=TRUE){
  # library() fails immediately when a package is missing; require() only
  # returns FALSE, after which filter() would resolve to stats::filter.
  library(Lahman)
  library(dplyr)
  name <- unlist(strsplit(p, " "))
  # NOTE(review): newer Lahman releases may provide this table as People
  # instead of Master -- confirm against the installed version.
  pid <- filter(Master, nameFirst==name[1],
                 nameLast==name[2])$retroID
  pid <- unique(pid[!is.na(pid)])
  if (length(pid) == 0) {
    stop("no retroID found for player '", p, "'", call. = FALSE)
  }
  if (length(pid) > 1) {
    warning("'", p, "' matched ", length(pid),
            " retroIDs; using all of them", call. = FALSE)
  }
  states <- c("0-0", "0-1", "1-0",
              "0-2", "1-1", "2-0",
              "1-2", "2-1", "3-0",
              "2-2", "3-1", "3-2", "X")
  M <- matrix(0, 13, 13, dimnames = list(states, states))
  # which() drops NA ids and %in% copes with more than one matching
  # retroID, where == would silently recycle.
  ids <- if (pitcher == TRUE) d2015$PIT_ID else d2015$BAT_ID
  PJ <- which(ids %in% pid)
  # Tally every observed before -> after transition for the selected
  # plate appearances.
  for (j in PJ) {
    start <- S[1, j][[1]]
    end <- S[2, j][[1]]
    if (!is.na(start[1])) {
      for (k in seq_along(start)) {
        M[start[k], end[k]] <- M[start[k], end[k]] + 1
      }
    }
  }
  # Convert counts to row proportions (rows with no observations become
  # NaN because their row sum is zero).
  P <- prop.table(M, 1)
  # Force the "0-1" -> "0-0" entry to zero.
  # NOTE(review): the reason for this override is not evident here.
  P[2, 1] <- 0
  # Make "X" absorbing: it only ever transitions to itself.
  P[13, ] <- rep(0, 13)
  P[13, 13] <- 1
  P
}

# compute_expected_appearances takes the 13 x 13 transition matrix P of
# the pitch-count Markov chain (absorbing state in row / column 13) and
# returns a data frame (Count, Expected) holding the expected number of
# visits to every transient state except the first, for a chain that
# starts in the first state.
compute_expected_appearances <- function(P){
  # Transitions among the 12 transient states only.
  transient <- P[-13, -13]
  # Row 1 of the fundamental matrix (I - Q)^-1: expected visits to each
  # transient state when the chain starts in the first one.
  fundamental <- solve(diag(12) - transient)
  visits <- fundamental[1, ]
  # The starting state itself is left out of the report.
  others <- visits[-1]
  data.frame(Count = names(visits)[-1],
             Expected = others)
}

# Example: build the transition matrix for Clayton Kershaw (selected as a
# pitcher, the create_MC default) and derive his expected number of
# appearances in each pitch count.
kershaw <- create_MC(p = "Clayton Kershaw")
E <- compute_expected_appearances(P = kershaw)
	# Load the 2015 Retrosheet play-by-play data.  The statements below use
	# a data frame called d2015, which is presumably supplied by this file.
	load("~/OneDriveBusiness/Retrosheet/pbp.2015.Rdata")

	# Keep only the rows flagged as batting events.
	d2015 <- subset(d2015, subset = BAT_EVENT_FL == TRUE)

	# Derive the simplified pitch sequence pseq from PITCH_SEQ_TX in three
	# passes: strip the non-pitch markers, then collapse the remaining
	# codes into "b" (ball) and "s" (strike).  local() keeps the
	# intermediate value out of the global environment.
	d2015$pseq <- local({
	  codes <- gsub("[.>123N+*]", "", d2015$PITCH_SEQ_TX)
	  codes <- gsub("[BIPV]", "b", codes)
	  gsub("[CFKLMOQRST]", "s", codes)
	})

	# one.string converts one simplified pitch sequence (a single value of
	# the pseq variable, e.g. "bsX") into the pitch counts visited during
	# that plate appearance.
	#
	# Argument:
	#   ex - character string made of "b", "s" and a terminal symbol.
	#
	# Returns an unnamed two-element list: the counts "before" each
	# transition (always starting at "0-0") and the counts "after" each
	# transition (always ending in the terminal state "X").

	one.string <- function(ex){
	  # a trailing strike or ball ended the plate appearance (strikeout or
	  # walk), so replace it with the terminal symbol X
	  ex <- gsub("s$", "X", ex)
	  ex <- gsub("b$", "X", ex)
	  # create a vector of individual outcomes
	  ex.v <- unlist(strsplit(ex, ""))
	  # remove the last (terminal) outcome from the vector
	  ex.v <- ex.v[-length(ex.v)]
	  # cumulative totals of balls and strikes; strikes are capped at 2
	  n.balls <- cumsum(ex.v == "b")
	  n.strikes <- pmin(cumsum(ex.v == "s"), 2)
	  # sanity check: drop a final count that already shows ball four or
	  # strike three (scalar test, hence && and ||)
	  N <- length(n.balls)
	  if (N > 0 && (n.balls[N] == 4 || n.strikes[N] == 3)) {
	    n.balls <- n.balls[-N]
	    n.strikes <- n.strikes[-N]
	  }
	  # pitch count after every pitch, e.g. "1-2"
	  S <- paste(n.balls, n.strikes, sep = "-")
	  # add the starting count and the terminal outcome
	  S <- c("0-0", S, "X")
	  # before and after counts for every transition
	  b.count <- S[seq_len(length(S) - 1)]
	  e.count <- S[-1]
	  list(b.count, e.count)
	}

	# Apply one.string to every plate appearance.  S is a list-matrix with
	# two rows (row 1 = "before" counts, row 2 = "after" counts) and one
	# column per row of d2015.  vapply() guarantees that shape even when
	# d2015 has no rows, where sapply() would return an empty list and
	# break the S[1, j] / S[2, j] indexing used later.
	S <- vapply(d2015$pseq, one.string, FUN.VALUE = vector("list", 2))

	# create_MC builds the pitch-count Markov chain transition matrix for
	# one player.  It reads the global objects d2015 (play-by-play data)
	# and S (before / after pitch counts, one column per row of d2015).
	#
	# Arguments:
	#   p       - player name as "First Last"; the first two
	#             space-separated words are matched against nameFirst /
	#             nameLast in Lahman's Master table to obtain the retroID.
	#   pitcher - TRUE (default) selects plate appearances by PIT_ID,
	#             anything else selects them by BAT_ID.
	#
	# Returns a 13 x 13 matrix of row-wise transition proportions over the
	# 12 pitch counts plus the absorbing state "X".  Stops with an error
	# when the name matches no retroID.
	create_MC <- function(p, pitcher=TRUE){
	  # library() fails immediately when a package is missing; require()
	  # only returns FALSE, after which filter() would resolve to
	  # stats::filter.
	  library(Lahman)
	  library(dplyr)
	  name <- unlist(strsplit(p, " "))
	  # NOTE(review): newer Lahman releases may provide this table as
	  # People instead of Master -- confirm against the installed version.
	  pid <- filter(Master, nameFirst==name[1],
	                nameLast==name[2])$retroID
	  pid <- unique(pid[!is.na(pid)])
	  if (length(pid) == 0) {
	    stop("no retroID found for player '", p, "'", call. = FALSE)
	  }
	  if (length(pid) > 1) {
	    warning("'", p, "' matched ", length(pid),
	            " retroIDs; using all of them", call. = FALSE)
	  }
	  states <- c("0-0", "0-1", "1-0",
	              "0-2", "1-1", "2-0",
	              "1-2", "2-1", "3-0",
	              "2-2", "3-1", "3-2", "X")
	  M <- matrix(0, 13, 13, dimnames = list(states, states))
	  # which() drops NA ids and %in% copes with more than one matching
	  # retroID, where == would silently recycle.
	  ids <- if (pitcher == TRUE) d2015$PIT_ID else d2015$BAT_ID
	  PJ <- which(ids %in% pid)
	  # Tally every observed before -> after transition for the selected
	  # plate appearances.
	  for (j in PJ) {
	    start <- S[1, j][[1]]
	    end <- S[2, j][[1]]
	    if (!is.na(start[1])) {
	      for (k in seq_along(start)) {
	        M[start[k], end[k]] <- M[start[k], end[k]] + 1
	      }
	    }
	  }
	  # Convert counts to row proportions (rows with no observations
	  # become NaN because their row sum is zero).
	  P <- prop.table(M, 1)
	  # Force the "0-1" -> "0-0" entry to zero.
	  # NOTE(review): the reason for this override is not evident here.
	  P[2, 1] <- 0
	  # Make "X" absorbing: it only ever transitions to itself.
	  P[13, ] <- rep(0, 13)
	  P[13, 13] <- 1
	  P
	}

	# compute_expected_appearances takes the 13 x 13 transition matrix P of
	# the pitch-count Markov chain (absorbing state in row / column 13) and
	# returns a data frame (Count, Expected) holding the expected number of
	# visits to every transient state except the first, for a chain that
	# starts in the first state.
	compute_expected_appearances <- function(P){
	  # Transitions among the 12 transient states only.
	  transient <- P[-13, -13]
	  # Row 1 of the fundamental matrix (I - Q)^-1: expected visits to each
	  # transient state when the chain starts in the first one.
	  fundamental <- solve(diag(12) - transient)
	  visits <- fundamental[1, ]
	  # The starting state itself is left out of the report.
	  others <- visits[-1]
	  data.frame(Count = names(visits)[-1],
	             Expected = others)
	}

	# Example: build the transition matrix for Clayton Kershaw (selected as
	# a pitcher, the create_MC default) and derive his expected number of
	# appearances in each pitch count.
	kershaw <- create_MC(p = "Clayton Kershaw")
	E <- compute_expected_appearances(P = kershaw)