Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active March 26, 2023 14:17
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/404e27a1485db67650601220146f5059 to your computer and use it in GitHub Desktop.
Save bayesball/404e27a1485db67650601220146f5059 to your computer and use it in GitHub Desktop.
Code to download Retrosheet game log data with the focus on studying game scores.
# Main function to get Retrosheet game scores
#
# Downloads the Retrosheet game log archive for one season, reads it using the
# column names of the game log header file, and returns one row per game with
# the date, the two teams, the two run totals and the season.
#
# Arguments:
#   season  A single season identifier (e.g. 2019); it is pasted into the
#           archive name "gl<season>.zip".
#
# Returns:
#   A data frame with the columns Date, HomeTeam, VisitingTeam, HomeRunsScore,
#   VisitorRunsScored and Season (the value passed in as `season`).
#
# Side effects:
#   Attaches dplyr and readr and downloads two files per call (the header file
#   and the season archive). The archive is handled in a temporary directory
#   that is removed on exit, so nothing is left in the working directory.
get_scores <- function(season) {
  # library() stops with an error when a package is missing, whereas
  # require() would only return FALSE and let the failure surface later.
  library(dplyr)
  library(readr)
  stopifnot(length(season) == 1)

  # Download and parse the full game log for one season.
  load_gamelog <- function(season) {
    glheaders <- read_csv("https://raw.githubusercontent.com/beanumber/baseball_R/master/data/game_log_header.csv")
    remote <- paste0("http://www.retrosheet.org/gamelogs/gl",
                     season, ".zip")

    # Work in a private temporary directory; on.exit() guarantees the
    # clean-up even when the download or the parsing fails.
    work_dir <- tempfile("gamelog_")
    dir.create(work_dir)
    on.exit(unlink(work_dir, recursive = TRUE), add = TRUE)

    local <- file.path(work_dir, paste0("gl", season, ".zip"))
    # "wb" keeps the binary archive intact on every platform
    download.file(url = remote, destfile = local, mode = "wb")

    # unzip() returns the paths it extracted; match the expected member name
    # without relying on its letter case.
    extracted <- unzip(local, exdir = work_dir)
    wanted <- paste0("gl", season, ".txt")
    local_txt <- extracted[tolower(basename(extracted)) == tolower(wanted)]
    if (length(local_txt) == 0) {
      stop("No game log file '", wanted, "' found in ", remote,
           call. = FALSE)
    }

    read_csv(local_txt[1],
             col_names = names(glheaders),
             na = character())
  }

  load_gamelog(season) %>%
    select(Date, HomeTeam, VisitingTeam,
           HomeRunsScore, VisitorRunsScored) %>%
    mutate(Season = season)
}
# Single-season example: game scores for 2019
d <- get_scores(season = 2019)
# Scores for the fifty seasons 1970 through 2019: one get_scores() call per
# season, with the per-season results combined into a single data frame
library(purrr)
df <- map_df(.x = 1970:2019, .f = get_scores)
library(dplyr)
library(ggplot2)
library(ProbBayes)
# Which game(s) had the highest combined run total?
# Runs = home runs scored + visitor runs scored; every game tied for the
# maximum is kept.
filter(
  mutate(df, Runs = HomeRunsScore + VisitorRunsScored),
  Runs == max(Runs)
)
# Which game(s) had the largest margin of victory?
# Margin_Victory = absolute difference between the two run totals; every
# game tied for the maximum is kept.
filter(
  mutate(df, Margin_Victory = abs(HomeRunsScore - VisitorRunsScored)),
  Margin_Victory == max(Margin_Victory)
)
# Mean total runs per game (home + visitor) for each season, drawn as points
# with a loess smooth (span 0.3) on top
p1 <- ggplot(
  data = summarize(
    group_by(df, Season),
    Runs = mean(HomeRunsScore + VisitorRunsScored),
    .groups = "drop"
  ),
  mapping = aes(x = Season, y = Runs)
) +
  geom_point() +
  geom_smooth(method = "loess", span = 0.3) +
  ggtitle("Total Runs Scored") +
  centertitle() +
  increasefont()
# Mean win margin (absolute run difference) for each season, drawn as points
# with a loess smooth (span 0.3) on top
p2 <- ggplot(
  data = summarize(
    group_by(df, Season),
    Win_Margin = mean(abs(HomeRunsScore - VisitorRunsScored)),
    .groups = "drop"
  ),
  mapping = aes(x = Season, y = Win_Margin)
) +
  geom_point() +
  geom_smooth(method = "loess", span = 0.3) +
  ggtitle("Win Margin") +
  centertitle() +
  increasefont()
# Draw the two season plots together in one plotting window
library(gridExtra)
grid.arrange(grobs = list(p1, p2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment