Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created November 12, 2017 03:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/f31f87e6c43d2ae6703efc40a097d721 to your computer and use it in GitHub Desktop.
Save bayesball/f31f87e6c43d2ae6703efc40a097d721 to your computer and use it in GitHub Desktop.
Exploring game durations
# load some packages
library(dplyr)
library(Lahman)
library(ggplot2)
library(readr)
# Shared theme tweak for plot titles: blue, 14-point, horizontally centred.
# It is added to each plot in this script with `+ TH`.
TH <- theme(
  plot.title = element_text(
    angle = 0,
    hjust = 0.5,
    vjust = 0.8,
    size = 14,
    colour = "blue"
  )
)
# Collect game log data for a particular season.
#
# Arguments:
#   season       Season (year); the file read is `gl<season>.txt`.
#   gamelog_dir  Directory containing the `gl<season>.txt` files and
#                `game_log_header.csv`. The default is the location this
#                script has always used, so `get_gldata(season)` is unchanged.
#
# Returns a data.frame whose first column is `Season`, followed by the game
# log columns named after the columns of the header file.
get_gldata <- function(season,
                       gamelog_dir = "~/Google Drive/gamelogs/gamelogs") {
  file_location <- file.path(gamelog_dir, paste0("gl", season, ".txt"))
  header_location <- file.path(gamelog_dir, "game_log_header.csv")
  # The season file is read without a header row; the column names are taken
  # from the separate header file instead.
  gldata <- read_csv(file_location, col_names = FALSE)
  headers <- read_csv(header_location)
  names(gldata) <- names(headers)
  data.frame(Season = season, gldata)
}
# collect data for Halladay seasons
# Each season is read once and the pieces are combined with a single rbind()
# call. This avoids re-copying ALL_GAMES on every iteration and removes the
# hard-coded 1:15 index that had to be kept in sync with `seasons`.
seasons <- 1999:2013
ALL_GAMES <- do.call(rbind, lapply(seasons, get_gldata))
# Build one row per starting pitcher per game: take the home starters and the
# visiting starters separately, give both the common column name StarterID,
# tag each row as a "Home" or "Away" start, and stack the two pieces.
start1 <- ALL_GAMES %>%
  select(Season, Date, StarterID = HomeStartingPitcherID, Duration) %>%
  mutate(Game = "Home")
start2 <- ALL_GAMES %>%
  select(Season, Date, StarterID = VisitorStartingPitcherID, Duration) %>%
  mutate(Game = "Away")
start <- rbind(start1, start2)
# Boxplots of game duration, one box per season (drawn horizontally)
ggplot(start, aes(x = as.character(Season), y = Duration)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    x = "Season",
    y = "Duration (Minutes)",
    title = "Boxplots of Game Durations for 15 Seasons"
  ) +
  TH
# Flag the starts made by Halladay (starter ID "hallr001") with "yes"/"no"
start <- mutate(
  start,
  Halladay = ifelse(StarterID == "hallr001", "yes", "no")
)
# Per-season boxplots of game duration, split by the Halladay indicator
ggplot(
  start,
  aes(x = as.character(Season), y = Duration, color = Halladay)
) +
  geom_boxplot() +
  coord_flip() +
  labs(
    x = "Season",
    y = "Duration (Minutes)",
    title = "Game Durations With and Without Halladay"
  ) +
  TH
# find the 10 best (shortest) duration starters for
# each of the 15 seasons
# Only starters with at least 19 starts in a season are ranked. Missing
# durations are skipped when taking the median (na.rm = TRUE), matching
# median_duration(); otherwise a single missing value would make a starter's
# median NA and push him to the bottom of the ranking.
toplist2 <- start %>%
  group_by(Season, StarterID) %>%
  summarize(N = n(), M = median(Duration, na.rm = TRUE)) %>%
  filter(N >= 19) %>%
  arrange(Season, M) %>%
  # summarize() peels off the StarterID grouping, so the data are still
  # grouped by Season here and slice() keeps the first 10 rows per season.
  slice(1:10)
# Count how many times each starter appears on a season top-ten list, keep
# the starters with three or more appearances, and look up their names.
Names <- toplist2 %>%
  group_by(StarterID) %>%
  summarize(N = n())
Names2 <- filter(Names, N >= 3)
# NOTE(review): recent Lahman releases may name this table `People` rather
# than `Master` -- confirm against the installed package version.
Names3 <- inner_join(
  Names2,
  dplyr::select(Master, nameFirst, nameLast, retroID),
  by = c("StarterID" = "retroID")
) %>%
  mutate(Name = paste(nameFirst, nameLast)) %>%
  dplyr::select(Name, N)
# Dot plot of top-10 appearance counts, starters ordered by their count
ggplot(Names3, aes(x = reorder(Name, N), y = N)) +
  geom_point() +
  coord_flip() +
  scale_y_continuous(limits = c(2, 10)) +
  labs(
    x = "Starter",
    y = "Number of Top-10 Appearances",
    title = "Number of Top-10 Appearances by Game Duration"
  ) +
  TH
# Compare durations of Greg Maddux and Roy Halladay
# First attach each starter's first and last name from the Lahman table and
# build a display name. This is an inner join, so starts whose StarterID has
# no matching retroID are dropped from `start`.
start <- start %>%
  inner_join(
    dplyr::select(Master, nameFirst, nameLast, retroID),
    by = c("StarterID" = "retroID")
  ) %>%
  mutate(Name = paste(nameFirst, nameLast))
# Per-season duration boxplots for the two starters, seasons up to 2008
ggplot(
  filter(
    start,
    StarterID %in% c("hallr001", "maddg002"),
    Season <= 2008
  ),
  aes(x = as.character(Season), y = Duration, color = Name)
) +
  geom_boxplot() +
  coord_flip() +
  labs(
    x = "Season",
    y = "Duration (Minutes)",
    title = "Maddux vs. Halladay, 1999-2008"
  ) +
  TH
# How has the median game duration changed over the
# last 50 seasons 1967-2016?
#
# Median game duration for one season.
#
# Arguments:
#   season       Season (year); the file read is `gl<season>.txt`.
#   gamelog_dir  Directory containing the `gl<season>.txt` files and
#                `game_log_header.csv`. The default is the location this
#                script has always used, so `median_duration(season)` is
#                unchanged.
#
# Returns the median of the `Duration` column, ignoring missing values.
median_duration <- function(season,
                            gamelog_dir = "~/Google Drive/gamelogs/gamelogs") {
  file_location <- file.path(gamelog_dir, paste0("gl", season, ".txt"))
  header_location <- file.path(gamelog_dir, "game_log_header.csv")
  # The season file is read without a header row; the column names are taken
  # from the separate header file instead.
  gldata <- read_csv(file_location, col_names = FALSE)
  headers <- read_csv(header_location)
  names(gldata) <- names(headers)
  median(gldata$Duration, na.rm = TRUE)
}
# One median per season. vapply() guarantees a numeric vector with exactly
# one value per season, where sapply() would silently change its return type
# on unexpected results.
seasons <- 1967:2016
medians <- vapply(seasons, median_duration, numeric(1))
# Scatterplot of the season medians with a red smoother, plus dashed blue
# reference lines (and text labels) at 150 and 180 minutes.
ggplot(
  data.frame(Season = seasons, Median = medians),
  aes(x = Season, y = Median)
) +
  geom_point() +
  geom_smooth(span = 0.3, color = "red") +
  geom_hline(yintercept = 180, color = "blue", linetype = "dashed") +
  geom_hline(yintercept = 150, color = "blue", linetype = "dashed") +
  annotate(
    geom = "text", x = 2010, y = 152,
    label = "2 1/2 hours", color = "blue"
  ) +
  annotate(
    geom = "text", x = 1975, y = 182,
    label = "3 hours", color = "blue"
  ) +
  ggtitle("Median Game Duration: 1967:2016") +
  TH
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment