Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active December 23, 2016 14:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/a403beaae2aa81698482831c5a395a01 to your computer and use it in GitHub Desktop.
Save bayesball/a403beaae2aa81698482831c5a395a01 to your computer and use it in GitHub Desktop.
R work for stolen base attempt study using 2016 Retrosheet data
# stolen base study
# look at SB and SB Attempts
# downloading Retrosheet data and computing runs values
# using two helper functions
# see https://baseballwithr.wordpress.com/2014/02/10/downloading-retrosheet-data-and-runs-expectancy/
# for specific instructions on downloading Retrosheet data
library(devtools)
devtools::source_gist("8892981",
filename = "parse.retrosheet2.pbp.R")
devtools::source_gist("8892999",
filename = "compute.runs.expectancy.R")
parse.retrosheet2.pbp(2016)
setwd("download.folder/unzipped")
d2016 <- compute.runs.expectancy(2016)
save(d2016, file="d2016.Rdata")
# read in Retrosheet data for the 2016 season
library(dplyr)
load("download.folder/unzipped/d2016.Rdata")
# define the bat team id variable and convert
# EVENT_TX to character type
d2016 <- mutate(d2016,
BAT_TEAM_ID=ifelse(BAT_HOME_ID==0,
as.character(AWAY_TEAM_ID),
substr(HALF.INNING, 1, 3)),
EVENT_TX=as.character(EVENT_TX))
# SB is a data frame with only the stolen base attempts
SB <- filter(d2016, grepl("SB", EVENT_TX) == TRUE |
grepl("CS", EVENT_TX) == TRUE)
# find some overall summaries to check (with BR) that we
# have captured all of the SB attempts for 2016 season
summarize(SB,
SB2 = sum(grepl("SB2", EVENT_TX) == TRUE),
SB3 = sum(grepl("SB3", EVENT_TX) == TRUE),
SBH = sum(grepl("SBH", EVENT_TX) == TRUE),
CS2 = sum(grepl("CS2", EVENT_TX) == TRUE),
CS3 = sum(grepl("CS3", EVENT_TX) == TRUE),
CSH = sum(grepl("CSH", EVENT_TX) == TRUE))
# count the number of SB and CS of each possible base
library(stringi)
SB <- mutate(SB,
SB2 = stri_count_regex(EVENT_TX, "SB2"),
CS2 = stri_count_regex(EVENT_TX, "CS2"),
SB3 = stri_count_regex(EVENT_TX, "SB3"),
CS3 = stri_count_regex(EVENT_TX, "CS3"),
SBH = stri_count_regex(EVENT_TX, "SBH"),
CSH = stri_count_regex(EVENT_TX, "CSH"),
SB = SB2 + SB3 + SBH,
CS = CS2 + CS3 + CSH)
# when (states?) is there an attempted steal of 2nd base?
# S1 is data frame for the 2nd base attempts
# compute the number of attempts and percentage successful
# construct a graph of this data
S1 <- summarize(group_by(filter(SB, SB2 + CS2 > 0), STATE),
Yes=sum(SB2), No=sum(CS2),
Number_Attempts = Yes + No,
Pct_Yes=round(100 * Yes / (Yes + No)))
library(ggplot2)
TH <- theme(plot.title = element_text(colour = "blue",
size = 18,
hjust = 0.5, vjust = 0.8, angle = 0))
ggplot(S1, aes(STATE, Pct_Yes, size=Number_Attempts)) +
geom_point() + ylim(50, 100) +
ggtitle("Success in Stealing 2nd Base") + TH +
ylab("Success Rate")
# try similar graph for attempted steals of 3rd
S2 <- summarize(group_by(filter(SB, SB3 + CS3 > 0), STATE),
Yes=sum(SB3), No=sum(CS3),
Number_Attempts = Yes + No,
Pct_Yes=round(100 * Yes / (Yes + No)))
ggplot(S2, aes(STATE, Pct_Yes, size=Number_Attempts)) +
geom_point() + ylim(50, 100) +
ggtitle("Success in Stealing 3rd Base") + TH +
ylab("Success Rate")
# Number of attempts and success rate by team
SB_team <- summarize(group_by(SB, BAT_TEAM_ID),
SB = sum(SB),
CS = sum(CS),
Attempts = SB + CS,
Success_Rate = 100 * SB / Attempts)
SB_team$BAT_TEAM_ID <- with(SB_team,
factor(BAT_TEAM_ID,
levels=BAT_TEAM_ID[order(Attempts)]))
# Here is a bar graph of the number of attempts by team
# (did not show this in the blog post)
ggplot(SB_team, aes(BAT_TEAM_ID, Attempts)) +
geom_bar(stat="identity") +
coord_flip() +
ggtitle("2016 Stolen Base Attempts") + TH
# Scatterplot of attempts and success rate
ggplot(SB_team, aes(Attempts, Success_Rate,
label=BAT_TEAM_ID)) +
geom_smooth(method="lm") +
ggtitle("Stolen Base Attempts and Success Rate") + TH +
geom_text(check_overlap = TRUE)
## value of stolen bases
# only look at cases where there is a single stealing attempt
# and there is no plate appearance event
# Type variable indicates what the base of the SB attempt
SB_1 <- filter(SB, SB + CS == 1,
BAT_EVENT_FL == FALSE)
SB_1 <- mutate(SB_1,
Type=ifelse(SB2 + CS2 == 1, "2nd Base",
ifelse(SB3 + CS3 == 1, "3rd Base",
"Home")))
# Construct a violin plot of the run values by Type
ggplot(SB_1, aes(Type, RUNS.VALUE, color=factor(SB))) +
geom_violin() +
coord_flip() +
ggtitle("Runs Value of Different Stealing Attempts") +
TH +
geom_hline(yintercept = 0, color="blue")
# Displays some summaries of the run values
SM <- summarize(group_by(SB_1, Type),
N=n(),
Mean=mean(RUNS.VALUE),
Median=median(RUNS.VALUE),
QS=diff(quantile(RUNS.VALUE, c(.25, .75))))
SM[, -1] <- round(SM[, -1],3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment