Last active
December 23, 2016 14:58
-
-
Save bayesball/a403beaae2aa81698482831c5a395a01 to your computer and use it in GitHub Desktop.
R work for stolen base attempt study using 2016 Retrosheet data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# stolen base study | |
# look at SB and SB Attempts | |
# downloading Retrosheet data and computing runs values | |
# using two helper functions | |
# see https://baseballwithr.wordpress.com/2014/02/10/downloading-retrosheet-data-and-runs-expectancy/ | |
# for specific instructions on downloading Retrosheet data | |
library(devtools) | |
devtools::source_gist("8892981", | |
filename = "parse.retrosheet2.pbp.R") | |
devtools::source_gist("8892999", | |
filename = "compute.runs.expectancy.R") | |
parse.retrosheet2.pbp(2016) | |
setwd("download.folder/unzipped") | |
d2016 <- compute.runs.expectancy(2016) | |
save(d2016, file="d2016.Rdata") | |
# read in Retrosheet data for the 2016 season | |
library(dplyr) | |
load("download.folder/unzipped/d2016.Rdata") | |
# define the bat team id variable and convert | |
# EVENT_TX to character type | |
d2016 <- mutate(d2016, | |
BAT_TEAM_ID=ifelse(BAT_HOME_ID==0, | |
as.character(AWAY_TEAM_ID), | |
substr(HALF.INNING, 1, 3)), | |
EVENT_TX=as.character(EVENT_TX)) | |
# SB is a data frame with only the stolen base attempts | |
SB <- filter(d2016, grepl("SB", EVENT_TX) == TRUE | | |
grepl("CS", EVENT_TX) == TRUE) | |
# find some overall summaries to check (with BR) that we | |
# have captured all of the SB attempts for 2016 season | |
summarize(SB, | |
SB2 = sum(grepl("SB2", EVENT_TX) == TRUE), | |
SB3 = sum(grepl("SB3", EVENT_TX) == TRUE), | |
SBH = sum(grepl("SBH", EVENT_TX) == TRUE), | |
CS2 = sum(grepl("CS2", EVENT_TX) == TRUE), | |
CS3 = sum(grepl("CS3", EVENT_TX) == TRUE), | |
CSH = sum(grepl("CSH", EVENT_TX) == TRUE)) | |
# count the number of SB and CS of each possible base | |
library(stringi) | |
SB <- mutate(SB, | |
SB2 = stri_count_regex(EVENT_TX, "SB2"), | |
CS2 = stri_count_regex(EVENT_TX, "CS2"), | |
SB3 = stri_count_regex(EVENT_TX, "SB3"), | |
CS3 = stri_count_regex(EVENT_TX, "CS3"), | |
SBH = stri_count_regex(EVENT_TX, "SBH"), | |
CSH = stri_count_regex(EVENT_TX, "CSH"), | |
SB = SB2 + SB3 + SBH, | |
CS = CS2 + CS3 + CSH) | |
# when (states?) is there an attempted steal of 2nd base? | |
# S1 is data frame for the 2nd base attempts | |
# compute the number of attempts and percentage successful | |
# construct a graph of this data | |
S1 <- summarize(group_by(filter(SB, SB2 + CS2 > 0), STATE), | |
Yes=sum(SB2), No=sum(CS2), | |
Number_Attempts = Yes + No, | |
Pct_Yes=round(100 * Yes / (Yes + No))) | |
library(ggplot2) | |
TH <- theme(plot.title = element_text(colour = "blue", | |
size = 18, | |
hjust = 0.5, vjust = 0.8, angle = 0)) | |
ggplot(S1, aes(STATE, Pct_Yes, size=Number_Attempts)) + | |
geom_point() + ylim(50, 100) + | |
ggtitle("Success in Stealing 2nd Base") + TH + | |
ylab("Success Rate") | |
# try similar graph for attempted steals of 3rd | |
S2 <- summarize(group_by(filter(SB, SB3 + CS3 > 0), STATE), | |
Yes=sum(SB3), No=sum(CS3), | |
Number_Attempts = Yes + No, | |
Pct_Yes=round(100 * Yes / (Yes + No))) | |
ggplot(S2, aes(STATE, Pct_Yes, size=Number_Attempts)) + | |
geom_point() + ylim(50, 100) + | |
ggtitle("Success in Stealing 3rd Base") + TH + | |
ylab("Success Rate") | |
# Number of attempts and success rate by team | |
SB_team <- summarize(group_by(SB, BAT_TEAM_ID), | |
SB = sum(SB), | |
CS = sum(CS), | |
Attempts = SB + CS, | |
Success_Rate = 100 * SB / Attempts) | |
SB_team$BAT_TEAM_ID <- with(SB_team, | |
factor(BAT_TEAM_ID, | |
levels=BAT_TEAM_ID[order(Attempts)])) | |
# Here is a bar graph of the number of attempts by team | |
# (did not show this in the blog post) | |
ggplot(SB_team, aes(BAT_TEAM_ID, Attempts)) + | |
geom_bar(stat="identity") + | |
coord_flip() + | |
ggtitle("2016 Stolen Base Attempts") + TH | |
# Scatterplot of attempts and success rate | |
ggplot(SB_team, aes(Attempts, Success_Rate, | |
label=BAT_TEAM_ID)) + | |
geom_smooth(method="lm") + | |
ggtitle("Stolen Base Attempts and Success Rate") + TH + | |
geom_text(check_overlap = TRUE) | |
## value of stolen bases | |
# only look at cases where there is a single stealing attempt | |
# and there is no plate appearance event | |
# Type variable indicates what the base of the SB attempt | |
SB_1 <- filter(SB, SB + CS == 1, | |
BAT_EVENT_FL == FALSE) | |
SB_1 <- mutate(SB_1, | |
Type=ifelse(SB2 + CS2 == 1, "2nd Base", | |
ifelse(SB3 + CS3 == 1, "3rd Base", | |
"Home"))) | |
# Construct a violin plot of the run values by Type | |
ggplot(SB_1, aes(Type, RUNS.VALUE, color=factor(SB))) + | |
geom_violin() + | |
coord_flip() + | |
ggtitle("Runs Value of Different Stealing Attempts") + | |
TH + | |
geom_hline(yintercept = 0, color="blue") | |
# Displays some summaries of the run values | |
SM <- summarize(group_by(SB_1, Type), | |
N=n(), | |
Mean=mean(RUNS.VALUE), | |
Median=median(RUNS.VALUE), | |
QS=diff(quantile(RUNS.VALUE, c(.25, .75)))) | |
SM[, -1] <- round(SM[, -1],3) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment