Last active
January 5, 2017 01:59
-
-
Save bayesball/b5f67bd3923de99454fa1a2093ab8013 to your computer and use it in GitHub Desktop.
Using Retrosheet game log data to explore game durations for the 2016 season
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# game log data available at | |
# http://www.retrosheet.org/gamelogs/index.html | |
# csv file with headers available at | |
# https://github.com/maxtoki/baseball_R/blob/master/data/game_log_header.csv | |
# my gamelog data for the 2016 season stored in | |
# data frame d | |
# this is some theme information that I'll using in | |
# my ggplot2 plots | |
TH <- theme( | |
plot.title = element_text( | |
colour = "blue", | |
size = 18, | |
hjust = 0.5, | |
vjust = 0.8, | |
angle = 0 | |
) | |
) | |
# load in some packages | |
library(lubridate) | |
library(ggplot2) | |
library(dplyr) | |
# convert Date variable to number of days past April 1? | |
d$date <- ymd(d$Date) | |
d$day <- yday(d$date) - yday("2016-04-01") | |
# graph game duration against day number | |
ggplot(d, aes(day, Duration)) + geom_point(size=0.5) + | |
geom_smooth() + | |
ggtitle("Length of Game by Date") + TH | |
# graph game duration against the home team id | |
ggplot(d, aes(HomeTeam, Duration)) + geom_boxplot() + | |
coord_flip() | |
# Day of Week effect? | |
# I first sort the days of week by the median game length | |
summarize(group_by(d, DayOfWeek), M=median(Duration)) %>% | |
arrange(M) -> S | |
d$DayOfWeek <- factor(d$DayOfWeek, | |
levels=S$DayOfWeek[order(S$M)]) | |
ggplot(d, aes(DayOfWeek, Duration)) + geom_boxplot() + | |
coord_flip() + | |
ggtitle("Length of Game by Day of Week") + TH | |
# Compute some additional variables | |
# Number of non-outs, runs scored, number left-on-base, | |
# left-on-base fraction | |
d <- mutate(d, | |
NonOuts = HomeH + HomeBB + HomeHBP + | |
VisitorH + VisitorBB + VisitorHBP, | |
Runs = HomeRunsScore + VisitorRunsScored, | |
LOB = HomeLOB + VisitorLOB, | |
LOB_Frac = LOB / NonOuts) | |
# strong relationship of nonouts with duration | |
ggplot(d, aes(NonOuts, Duration)) + | |
geom_point(size=0.5) + | |
geom_smooth() + | |
ggtitle("Length of Game by NonOuts") + TH | |
# limit to 9 inning games | |
d9 <- filter(d, LengthInOuts > 50, LengthInOuts < 55) | |
# final regression | |
fit3 <- lm(Duration ~ NonOuts + LOB_Frac, data=d9) | |
# add residuals to data frame and then plot residuals | |
d9$Residual <- fit3$residuals | |
ggplot(d9, aes(NonOuts, Residual)) + | |
geom_point(size=0.5) + geom_hline(yintercept = 0) + | |
ggtitle("The Residuals") + TH | |
# fit is 95.77 + 2.71 * NonOuts + 35.32 * LOB_Frac | |
# NonOuts (10, 40) | |
# LOB (.2, .9) | |
FIT <- NULL | |
for (lob in seq(.2, .9, by=.3)){ | |
df <- data.frame(NonOuts=10:40, LOB_Frac=lob) | |
df$Duration <- predict(fit3, df) | |
df$LOB_Frac <- factor(df$LOB_Frac) | |
FIT <- rbind(FIT, df) | |
} | |
# plot the fit | |
ggplot(FIT, aes(NonOuts, Duration, | |
color=LOB_Frac)) + TH + | |
geom_line() + ggtitle("Predicted Length of Game") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment