Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created December 31, 2019 13:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/8d6719dea05a5ea4e8d5930bdfa4a6cf to your computer and use it in GitHub Desktop.
Save bayesball/8d6719dea05a5ea4e8d5930bdfa4a6cf to your computer and use it in GitHub Desktop.
R work for the "Why are baseball games so long?" post
library(tidyverse)
library(CalledStrike)
library(broom)
######################################
# function retro_work()
######################################
# Summarises a Retrosheet play-by-play data frame at the game level.
#
# Argument:
#   d - play-by-play data frame; the columns used here are GAME_ID,
#       PITCH_SEQ_TX and INN_CT
# Value:
#   a data frame with one row per game whose largest INN_CT is exactly 9,
#   holding
#     GAME_ID   - game identifier, converted to character
#     PA        - number of rows of d belonging to the game
#     N_Pitches - characters left in PITCH_SEQ_TX, summed over those rows,
#                 after the characters . > 1 2 3 N + * are removed
#     maxINNING - largest INN_CT of the game
#
# NOTE(review): PA and N_Pitches are computed over every row of d for a
# game. If d also contains rows that are not completed plate appearances,
# those rows are counted too -- confirm against how d was built.
retro_work <- function(d) {
  # characters of the pitch sequence that are dropped before counting
  non_pitch <- "[.>123N+*]"
  game_totals <- summarize(
    group_by(d, GAME_ID),
    PA = n(),
    N_Pitches = sum(nchar(gsub(non_pitch, "", PITCH_SEQ_TX))),
    maxINNING = max(INN_CT)
  )
  # keep only the games that ended in the 9th inning
  nine_inning <- filter(game_totals, maxINNING == 9)
  # character ids so the result can be joined on GAME_ID later
  nine_inning$GAME_ID <- as.character(nine_inning$GAME_ID)
  nine_inning
}
########################################
# function one_season()
########################################
# Adds the game duration from the Retrosheet game logs to the per-game
# summary produced by retro_work().
#
# Arguments:
#   season      - season number; used to build the game log file name
#                 gl<season>.txt and stored in the Season column
#   retroS      - output of retro_work() (needs a character GAME_ID column)
#   gamelog_dir - folder holding the gl<season>.txt files and
#                 game_log_header.csv; defaults to the original location
# Value:
#   retroS restricted to the games also found in the game log, with the
#   extra columns Duration and Season
one_season <- function(season, retroS,
                       gamelog_dir =
                         "~/Dropbox/Google Drive/gamelogs/gamelogs") {
  log_file <- file.path(gamelog_dir, paste0("gl", season, ".txt"))
  header_file <- file.path(gamelog_dir, "game_log_header.csv")

  # fail early with a clear message instead of an opaque reader error
  needed <- c(log_file, header_file)
  absent <- needed[!file.exists(needed)]
  if (length(absent) > 0) {
    stop("game log file(s) not found: ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  # the game log has no header row; column names come from a separate file
  gdata <- read_csv(log_file, col_names = FALSE)
  header <- read_csv(header_file)
  names(gdata) <- names(header)

  # build the key that is matched against GAME_ID of retroS
  gdata <- mutate(gdata,
                  GAME_ID = paste0(HomeTeam, Date, DoubleHeader))

  retroS <- inner_join(retroS,
                       select(gdata, GAME_ID, Duration),
                       by = "GAME_ID")
  retroS$Season <- season
  retroS
}
# using the two functions
# load the Retrosheet play-by-play dataset on disk
load("~/Dropbox/Google Drive/Retrosheet/pbp.2019.Rdata")
S2019 <- retro_work(d2019)
S2019 <- one_season(2019, S2019)
# repeat this operation for the seasons 2000 through 2018 and row merge
# all of the season data frames into the data frame Sall, which every
# graph and regression below uses
# NOTE(review): assumes each earlier season is stored next to the 2019
# file as pbp.<season>.Rdata -- TODO confirm the file names
earlier_seasons <- lapply(2000:2018, function(season) {
  # load into a private environment so the workspace is not cluttered
  season_env <- new.env()
  loaded <- load(
    paste0("~/Dropbox/Google Drive/Retrosheet/pbp.", season, ".Rdata"),
    envir = season_env
  )
  # prefer the d<season> naming seen for 2019; otherwise the file must
  # hold exactly one object
  expected <- paste0("d", season)
  pbp_name <- if (expected %in% loaded) expected else loaded
  stopifnot(length(pbp_name) == 1)
  one_season(season, retro_work(get(pbp_name, envir = season_env)))
})
Sall <- bind_rows(c(earlier_seasons, list(S2019)))
# season-by-season average of Duration, drawn as red points with a
# loess smooth through them
Sall %>%
  group_by(Season) %>%
  summarize(M = mean(Duration)) %>%
  ggplot(mapping = aes(x = Season, y = M)) +
  geom_point(colour = "red", size = 4) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    title = "Mean Length of a 9 Inning Game",
    y = "Mean Duration (Min)"
  ) +
  increasefont() +
  centertitle()
# season-by-season average of PA, drawn as red points with a
# loess smooth through them
Sall %>%
  group_by(Season) %>%
  summarize(M = mean(PA)) %>%
  ggplot(mapping = aes(x = Season, y = M)) +
  geom_point(colour = "red", size = 4) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    title = "Mean Number of PA of a 9 Inning Game",
    y = "Mean PA"
  ) +
  increasefont() +
  centertitle()
# season-by-season average of the per-game ratio N_Pitches / PA,
# drawn as red points with a loess smooth through them
Sall %>%
  group_by(Season) %>%
  summarize(M = mean(N_Pitches / PA)) %>%
  ggplot(mapping = aes(x = Season, y = M)) +
  geom_point(colour = "red", size = 4) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    title = "Mean Number of Pitches per PA",
    y = "Mean Pitches / PA"
  ) +
  increasefont() +
  centertitle()
# season-by-season average of N_Pitches, drawn as red points with a
# loess smooth through them
Sall %>%
  group_by(Season) %>%
  summarize(M = mean(N_Pitches)) %>%
  ggplot(mapping = aes(x = Season, y = M)) +
  geom_point(colour = "red", size = 4) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    title = "Mean Number of Pitches per Game",
    y = "Mean Pitches per Game"
  ) +
  increasefont() +
  centertitle()
# fit the regression of Duration on N_Pitches separately for each
# season; the result has one row per season and model term
# (intercept and N_Pitches slope) in the tidy() layout
# group_modify() replaces the superseded do() and returns the same
# grouped data frame, with Season as the first column
regressions <- Sall %>%
  group_by(Season) %>%
  group_modify(~ tidy(lm(Duration ~ N_Pitches, data = .x)))
# per-season slope of the Duration ~ N_Pitches fit (the N_Pitches rows
# of regressions), drawn as red points with a loess smooth through them
regressions %>%
  filter(term == "N_Pitches") %>%
  ggplot(mapping = aes(x = Season, y = estimate)) +
  geom_point(colour = "red", size = 4) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    title = "Mean Time Per Pitch",
    y = "Mean Time Per Pitch (Min)"
  ) +
  increasefont() +
  centertitle()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment