Created
November 12, 2017 03:02
-
-
Save bayesball/f31f87e6c43d2ae6703efc40a097d721 to your computer and use it in GitHub Desktop.
Exploring game durations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load some packages | |
library(dplyr) | |
library(Lahman) | |
library(ggplot2) | |
library(readr) | |
# Reusable ggplot2 theme fragment that styles the plot title
# (blue, size 14, horizontally centered); added to each plot via `+ TH`.
TH <- theme(
  plot.title = element_text(
    angle = 0,
    vjust = 0.8,
    hjust = 0.5,
    size = 14,
    colour = "blue"
  )
)
# Collect game-log data for a particular season.
#
# Arguments:
#   season    Season (year); selects the file "gl<season>.txt".
#   data_dir  Directory containing the season files and
#             "game_log_header.csv". Defaults to the location this
#             script has always read from, so existing calls are unchanged.
#
# Returns a data.frame whose first column is `Season`, followed by the
# game-log columns named after the columns of the header file.
get_gldata <- function(season,
                       data_dir = "~/Google Drive/gamelogs/gamelogs") {
  log_path <- file.path(data_dir, paste0("gl", season, ".txt"))
  header_path <- file.path(data_dir, "game_log_header.csv")
  # The season file is read without a header row; its column names are
  # taken from the separate header file instead.
  gldata <- read_csv(log_path, col_names = FALSE)
  headers <- read_csv(header_path)
  names(gldata) <- names(headers)
  data.frame(Season = season, gldata)
}
# collect data for Halladay seasons
seasons <- 1999:2013
# Read every season first and bind once at the end: this follows the
# length of `seasons` instead of a hard-coded 1:15 index, and avoids
# re-copying the accumulated data frame on every iteration.
ALL_GAMES <- do.call(rbind, lapply(seasons, get_gldata))
# Build one long table of starts: each game contributes a row for the
# home starter and a row for the visiting starter, both carrying the
# game's season, date and duration. `Game` records which side it was.
start1 <- ALL_GAMES %>%
  select(Season, Date, StarterID = HomeStartingPitcherID, Duration) %>%
  mutate(Game = "Home")
start2 <- ALL_GAMES %>%
  select(Season, Date, StarterID = VisitorStartingPitcherID, Duration) %>%
  mutate(Game = "Away")
start <- rbind(start1, start2)
# One boxplot of game duration per season, drawn horizontally
ggplot(start, aes(x = as.character(Season), y = Duration)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    x = "Season",
    y = "Duration (Minutes)",
    title = "Boxplots of Game Durations for 15 Seasons"
  ) +
  TH
# Flag each start as "yes"/"no" depending on whether the starter
# is hallr001 (the ID used for Halladay throughout this script)
start <- mutate(
  start,
  Halladay = ifelse(StarterID == "hallr001", "yes", "no")
)
# Per-season duration boxplots, split (by color) into Halladay starts
# and all other starts
ggplot(
  start,
  aes(x = as.character(Season), y = Duration, color = Halladay)
) +
  geom_boxplot() +
  coord_flip() +
  labs(
    x = "Season",
    y = "Duration (Minutes)",
    title = "Game Durations With and Without Halladay"
  ) +
  TH
# For each season, rank starters with at least 19 starts by their
# median game duration and keep the 10 shortest.
# N = number of starts, M = median duration for that starter-season.
toplist2 <- start %>%
  group_by(Season, StarterID) %>%
  summarize(N = n(), M = median(Duration)) %>%
  filter(N >= 19) %>%
  arrange(Season, M) %>%
  # summarize() peels off only the StarterID grouping, so the result is
  # still grouped by Season and slice() takes the first 10 rows per season
  slice(1:10)
# Count how many season top-ten lists each starter appears on, keep the
# starters with three or more appearances, and attach their full names.
Names <- toplist2 %>%
  group_by(StarterID) %>%
  summarize(N = n())
Names2 <- filter(Names, N >= 3)
# NOTE(review): newer Lahman releases appear to expose this table as
# `People`; confirm `Master` exists in the installed version.
player_names <- dplyr::select(Master, nameFirst, nameLast, retroID)
Names3 <- Names2 %>%
  inner_join(player_names, by = c("StarterID" = "retroID")) %>%
  mutate(Name = paste(nameFirst, nameLast)) %>%
  dplyr::select(Name, N)
# Dot plot of appearance counts, starters ordered by count
ggplot(Names3, aes(x = reorder(Name, N), y = N)) +
  geom_point() +
  coord_flip() +
  scale_y_continuous(limits = c(2, 10)) +
  labs(
    x = "Starter",
    y = "Number of Top-10 Appearances",
    title = "Number of Top-10 Appearances by Game Duration"
  ) +
  TH
# Compare durations of Greg Maddux and Roy Halladay.
# First attach each starter's full name to `start` (starts whose ID has
# no match in Master are dropped by the inner join).
start <- start %>%
  inner_join(
    dplyr::select(Master, nameFirst, nameLast, retroID),
    by = c("StarterID" = "retroID")
  ) %>%
  mutate(Name = paste(nameFirst, nameLast))
# Then plot only the two pitchers' starts through the 2008 season
ggplot(
  filter(start, StarterID %in% c("hallr001", "maddg002"), Season <= 2008),
  aes(x = as.character(Season), y = Duration, color = Name)
) +
  geom_boxplot() +
  coord_flip() +
  labs(
    x = "Season",
    y = "Duration (Minutes)",
    title = "Maddux vs. Halladay, 1999-2008"
  ) +
  TH
# How has the median game duration changed over the
# last 50 seasons 1967-2016?
#
# median_duration(season) returns the median of the `Duration` column of
# one season's game log, ignoring missing values.
#
# Arguments:
#   season  Season (year) whose game-log file should be read.
#
# The file is loaded through get_gldata() rather than repeating its
# path-building and header-handling code here, so both readers stay in sync.
median_duration <- function(season) {
  # [[ ]] requires an exact column-name match (no partial matching as with $)
  durations <- get_gldata(season)[["Duration"]]
  median(durations, na.rm = TRUE)
}
# Median game duration for each of the seasons 1967 through 2016
seasons <- 1967:2016
# vapply() guarantees exactly one numeric value per season; sapply()'s
# return type depends on its input and could silently become a list.
medians <- vapply(seasons, median_duration, numeric(1))
# Scatterplot of the season medians with a red smoothed trend and dashed
# blue reference lines at 180 minutes (3 hours) and 150 minutes (2 1/2 hours)
ggplot(
  data.frame(Season = seasons, Median = medians),
  aes(Season, Median)
) +
  TH +
  geom_point() +
  geom_smooth(span = 0.3, color = "red") +
  ggtitle("Median Game Duration: 1967:2016") +
  geom_hline(yintercept = 180, color = "blue", linetype = "dashed") +
  geom_hline(yintercept = 150, color = "blue", linetype = "dashed") +
  annotate(
    geom = "text", x = 2010, y = 152,
    label = "2 1/2 hours", color = "blue"
  ) +
  annotate(
    geom = "text", x = 1975, y = 182,
    label = "3 hours", color = "blue"
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment