bayesball/thecountwork.R

## thecountwork.R
# I have the data files statcast2017.csv and UmpAssign08to17.csv
# in the current working directory

# load in packages that I need for this work

library(ggrepel)
library(tidyverse)
library(gridExtra)

# read in the data

sc <- read_csv("statcast2017.csv")
ump <- read_csv("UmpAssign08to17.csv")

# data wrangling

sc %>% mutate(Strike = ifelse(description ==
                    "called_strike", 1, 0),
              Ball = ifelse(description %in%
                c("ball", "blocked_ball"), 1, 0),
              Count = paste(balls, strikes, sep="-")) ->
  sc

sc %>% filter(Ball == 1 | Strike == 1)  -> sc_called

inner_join(sc_called, ump) -> sc_called

# function to implement the gam fitting

do_fit <- function(d){
  require(mgcv)
  gam(Strike ~ s(plate_x, plate_z),
      family=binomial,
      data=d)
}

##### function cplot()
##### predict and plot over grid
##### assuming gam fit stored in variable fit

cplot <- function(){
df_p <- expand.grid(plate_x = seq(-1.2, 1.2, length=50),
                    plate_z = seq(1, 4, length=50))
df_p$lp <- predict(fit, df_p)
df_p$Probability <- exp(df_p$lp) / (1 + exp(df_p$lp))

topKzone <- 3.5
botKzone <- 1.6
inKzone <- -0.85
outKzone <- 0.85
kZone <- data.frame(
  x=c(inKzone, inKzone, outKzone, outKzone, inKzone),
  y=c(botKzone, topKzone, topKzone, botKzone, botKzone)
)
ggplot(df_p)  +
  stat_contour(aes(x=plate_x, y=plate_z,
                   z=Probability,
                   color = ..level..),
               breaks=c(.5, .9),
               size=1.5) +
  geom_path(aes(x, y), data=kZone,
            lwd=1, col="red") +
  xlim(-1.5, 1.5) +
  ylim(1.0, 4.0)  +
  theme(
    plot.title = element_text(
      colour = "blue",
      size = 16,
      hjust = 0.5,
      vjust = 0.8,
      angle = 0
    )
  ) + coord_fixed(ratio = 1)
}

##################################

# basic plot

fit <- do_fit(filter(sc_called,
                     stand == "R"))
p1 <- cplot() + ggtitle("Batting Right")

fit <- do_fit(filter(sc_called,
                     stand == "L"))
p2 <- cplot() + ggtitle("Batting Left")
grid.arrange(p1, p2, ncol=2)

# look at number of strikes effect

fit <- do_fit(filter(sc_called,
                     stand == "R",
                     strikes == 0))
p1 <- cplot() + ggtitle("Right, 0 Strikes")

fit <- do_fit(filter(sc_called,
                     stand == "R",
                     strikes == 2))
p2 <- cplot() + ggtitle("Right, 2 Strikes")

fit <- do_fit(filter(sc_called,
                     stand == "L",
                     strikes == 0))

p3 <- cplot() + ggtitle("Left, 0 Strikes")

fit <- do_fit(filter(sc_called,
                     stand == "L",
                     strikes == 2))
p4 <- cplot() + ggtitle("Left, 2 Strikes")

library(gridExtra)
grid.arrange(p1, p2, p3, p4,
             top="True Strike Zones - 2017 Data ")

# find the number of strikes and balls for all umps

sc_called %>% group_by(UmpName) %>%
  summarize(S = sum(Strike),
            B = sum(Ball),
            N = S + B) -> SU

ggplot(SU, aes(N, S / N, label=UmpName)) +
  geom_point() +
  geom_label_repel(data =
            filter(SU, S/N > .333, N > 3000),
             color="red") +
  xlab("Number of Called Pitches") +
  ylab("Proportion of Called Strikes") +
  ggtitle("2017 Umpire Stats") +
  theme(
    plot.title = element_text(
      colour = "blue",
      size = 16,
      hjust = 0.5,
      vjust = 0.8,
      angle = 0
    )
  )

# focus on the top four in calling strikes

topUmp <- filter(SU, S / N > .333, N > 4000)

fit <-do_fit(filter(sc_called,
           UmpName == topUmp$UmpName[1],
           stand == "R"))
p1a <- cplot() +
  ggtitle(paste(topUmp$UmpName[1], "R"))
fit <-do_fit(filter(sc_called,
            UmpName == topUmp$UmpName[1],
             stand == "L"))
p1b <- cplot() +
  ggtitle(paste(topUmp$UmpName[1], "L"))

fit <-do_fit(filter(sc_called,
                UmpName == topUmp$UmpName[2],
                    stand == "R"))
p2a <- cplot() +
  ggtitle(paste(topUmp$UmpName[2], "R"))
fit <-do_fit(filter(sc_called,
              UmpName == topUmp$UmpName[2],
                    stand == "L"))
p2b <- cplot() +
  ggtitle(paste(topUmp$UmpName[2], "L"))

fit <-do_fit(filter(sc_called,
                UmpName == topUmp$UmpName[3],
                    stand == "R"))
p3a <- cplot() +
  ggtitle(paste(topUmp$UmpName[3], "R"))
fit <-do_fit(filter(sc_called,
                UmpName == topUmp$UmpName[3],
                    stand == "L"))
p3b <- cplot() +
  ggtitle(paste(topUmp$UmpName[3], "L"))

fit <-do_fit(filter(sc_called,
              UmpName == topUmp$UmpName[4],
                    stand == "R"))
p4a <- cplot() +
  ggtitle(paste(topUmp$UmpName[4], "R"))
fit <-do_fit(filter(sc_called,
                UmpName == topUmp$UmpName[4],
                    stand == "L"))
p4b <- cplot() +
  ggtitle(paste(topUmp$UmpName[4], "L"))

grid.arrange(p1a, p1b, p2a, p2b, ncol=2)

grid.arrange(p3a, p3b, p4a, p4b, ncol=2)
	# I have the data files statcast2017.csv and UmpAssign08to17.csv
	# in the current working directory

	# load in packages that I need for this work

	library(ggrepel)
	library(tidyverse)
	library(gridExtra)

	# read in the data

	sc <- read_csv("statcast2017.csv")
	ump <- read_csv("UmpAssign08to17.csv")

	# data wrangling

	sc %>% mutate(Strike = ifelse(description ==
	"called_strike", 1, 0),
	Ball = ifelse(description %in%
	c("ball", "blocked_ball"), 1, 0),
	Count = paste(balls, strikes, sep="-")) ->
	sc

	sc %>% filter(Ball == 1 \| Strike == 1) -> sc_called

	inner_join(sc_called, ump) -> sc_called

	# function to implement the gam fitting

	do_fit <- function(d){
	require(mgcv)
	gam(Strike ~ s(plate_x, plate_z),
	family=binomial,
	data=d)
	}

	##### function cplot()
	##### predict and plot over grid
	##### assuming gam fit stored in variable fit

	cplot <- function(){
	df_p <- expand.grid(plate_x = seq(-1.2, 1.2, length=50),
	plate_z = seq(1, 4, length=50))
	df_p$lp <- predict(fit, df_p)
	df_p$Probability <- exp(df_p$lp) / (1 + exp(df_p$lp))

	topKzone <- 3.5
	botKzone <- 1.6
	inKzone <- -0.85
	outKzone <- 0.85
	kZone <- data.frame(
	x=c(inKzone, inKzone, outKzone, outKzone, inKzone),
	y=c(botKzone, topKzone, topKzone, botKzone, botKzone)
	)
	ggplot(df_p) +
	stat_contour(aes(x=plate_x, y=plate_z,
	z=Probability,
	color = ..level..),
	breaks=c(.5, .9),
	size=1.5) +
	geom_path(aes(x, y), data=kZone,
	lwd=1, col="red") +
	xlim(-1.5, 1.5) +
	ylim(1.0, 4.0) +
	theme(
	plot.title = element_text(
	colour = "blue",
	size = 16,
	hjust = 0.5,
	vjust = 0.8,
	angle = 0
	)
	) + coord_fixed(ratio = 1)
	}

	##################################

	# basic plot

	fit <- do_fit(filter(sc_called,
	stand == "R"))
	p1 <- cplot() + ggtitle("Batting Right")

	fit <- do_fit(filter(sc_called,
	stand == "L"))
	p2 <- cplot() + ggtitle("Batting Left")
	grid.arrange(p1, p2, ncol=2)

	# look at number of strikes effect

	fit <- do_fit(filter(sc_called,
	stand == "R",
	strikes == 0))
	p1 <- cplot() + ggtitle("Right, 0 Strikes")

	fit <- do_fit(filter(sc_called,
	stand == "R",
	strikes == 2))
	p2 <- cplot() + ggtitle("Right, 2 Strikes")

	fit <- do_fit(filter(sc_called,
	stand == "L",
	strikes == 0))

	p3 <- cplot() + ggtitle("Left, 0 Strikes")

	fit <- do_fit(filter(sc_called,
	stand == "L",
	strikes == 2))
	p4 <- cplot() + ggtitle("Left, 2 Strikes")

	library(gridExtra)
	grid.arrange(p1, p2, p3, p4,
	top="True Strike Zones - 2017 Data ")

	# find the number of strikes and balls for all umps

	sc_called %>% group_by(UmpName) %>%
	summarize(S = sum(Strike),
	B = sum(Ball),
	N = S + B) -> SU

	ggplot(SU, aes(N, S / N, label=UmpName)) +
	geom_point() +
	geom_label_repel(data =
	filter(SU, S/N > .333, N > 3000),
	color="red") +
	xlab("Number of Called Pitches") +
	ylab("Proportion of Called Strikes") +
	ggtitle("2017 Umpire Stats") +
	theme(
	plot.title = element_text(
	colour = "blue",
	size = 16,
	hjust = 0.5,
	vjust = 0.8,
	angle = 0
	)
	)

	# focus on the top four in calling strikes

	topUmp <- filter(SU, S / N > .333, N > 4000)

	fit <-do_fit(filter(sc_called,
	UmpName == topUmp$UmpName[1],
	stand == "R"))
	p1a <- cplot() +
	ggtitle(paste(topUmp$UmpName[1], "R"))
	fit <-do_fit(filter(sc_called,
	UmpName == topUmp$UmpName[1],
	stand == "L"))
	p1b <- cplot() +
	ggtitle(paste(topUmp$UmpName[1], "L"))

	fit <-do_fit(filter(sc_called,
	UmpName == topUmp$UmpName[2],
	stand == "R"))
	p2a <- cplot() +
	ggtitle(paste(topUmp$UmpName[2], "R"))
	fit <-do_fit(filter(sc_called,
	UmpName == topUmp$UmpName[2],
	stand == "L"))
	p2b <- cplot() +
	ggtitle(paste(topUmp$UmpName[2], "L"))

	fit <-do_fit(filter(sc_called,
	UmpName == topUmp$UmpName[3],
	stand == "R"))
	p3a <- cplot() +
	ggtitle(paste(topUmp$UmpName[3], "R"))
	fit <-do_fit(filter(sc_called,
	UmpName == topUmp$UmpName[3],
	stand == "L"))
	p3b <- cplot() +
	ggtitle(paste(topUmp$UmpName[3], "L"))

	fit <-do_fit(filter(sc_called,
	UmpName == topUmp$UmpName[4],
	stand == "R"))
	p4a <- cplot() +
	ggtitle(paste(topUmp$UmpName[4], "R"))
	fit <-do_fit(filter(sc_called,
	UmpName == topUmp$UmpName[4],
	stand == "L"))
	p4b <- cplot() +
	ggtitle(paste(topUmp$UmpName[4], "L"))

	grid.arrange(p1a, p1b, p2a, p2b, ncol=2)

	grid.arrange(p3a, p3b, p4a, p4b, ncol=2)