Created
June 28, 2014 16:49
-
-
Save bayesball/93b71c7c4be08dc8f203 to your computer and use it in GitHub Desktop.
R code to find all team streaks in MLB baseball
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Variable names for the game log dataset, in column order.
# The vector has 161 entries; load.gamelog assigns them, in this order, to the
# columns of the file it reads.
# NOTE: a few names are misspelled ("HomeRunsScore", "Attendence",
# "LosingPitcherNAme"). They are kept exactly as they are because other code
# refers to the columns by these names (find.team.streaks uses HomeRunsScore).
Headers <- c(
  "Date", "DoubleHeader",
  "DayOfWeek", "VisitingTeam",
  "VisitingTeamLeague", "VisitingTeamGameNumber",
  "HomeTeam", "HomeTeamLeague",
  "HomeTeamGameNumber", "VisitorRunsScored",
  "HomeRunsScore", "LengthInOuts",
  "DayNight", "CompletionInfo",
  "ForfeitInfo", "ProtestInfo",
  "ParkID", "Attendence",
  "Duration", "VisitorLineScore",
  "HomeLineScore", "VisitorAB",
  "VisitorH", "VisitorD",
  "VisitorT", "VisitorHR",
  "VisitorRBI", "VisitorSH",
  "VisitorSF", "VisitorHBP",
  "VisitorBB", "VisitorIBB",
  "VisitorK", "VisitorSB",
  "VisitorCS", "VisitorGDP",
  "VisitorCI", "VisitorLOB",
  "VisitorPitchers", "VisitorER",
  "VisitorTER", "VisitorWP",
  "VisitorBalks", "VisitorPO",
  "VisitorA", "VisitorE",
  "VisitorPassed", "VisitorDB",
  "VisitorTP", "HomeAB",
  "HomeH", "HomeD",
  "HomeT", "HomeHR",
  "HomeRBI", "HomeSH",
  "HomeSF", "HomeHBP",
  "HomeBB", "HomeIBB",
  "HomeK", "HomeSB",
  "HomeCS", "HomeGDP",
  "HomeCI", "HomeLOB",
  "HomePitchers", "HomeER",
  "HomeTER", "HomeWP",
  "HomeBalks", "HomePO",
  "HomeA", "HomeE",
  "HomePassed", "HomeDB",
  "HomeTP", "UmpireHID",
  "UmpireHName", "Umpire1BID",
  "Umpire1BName", "Umpire2BID",
  "Umpire2BName", "Umpire3BID",
  "Umpire3BName", "UmpireLFID",
  "UmpireLFName", "UmpireRFID",
  "UmpireRFName", "VisitorManagerID",
  "VisitorManagerName", "HomeManagerID",
  "HomeManagerName", "WinningPitcherID",
  "WinningPitcherName", "LosingPitcherID",
  "LosingPitcherNAme", "SavingPitcherID",
  "SavingPitcherName", "GameWinningRBIID",
  "GameWinningRBIName", "VisitorStartingPitcherID",
  "VisitorStartingPitcherName", "HomeStartingPitcherID",
  "HomeStartingPitcherName", "VisitorBatting1PlayerID",
  "VisitorBatting1Name", "VisitorBatting1Position",
  "VisitorBatting2PlayerID", "VisitorBatting2Name",
  "VisitorBatting2Position", "VisitorBatting3PlayerID",
  "VisitorBatting3Name", "VisitorBatting3Position",
  "VisitorBatting4PlayerID", "VisitorBatting4Name",
  "VisitorBatting4Position", "VisitorBatting5PlayerID",
  "VisitorBatting5Name", "VisitorBatting5Position",
  "VisitorBatting6PlayerID", "VisitorBatting6Name",
  "VisitorBatting6Position", "VisitorBatting7PlayerID",
  "VisitorBatting7Name", "VisitorBatting7Position",
  "VisitorBatting8PlayerID", "VisitorBatting8Name",
  "VisitorBatting8Position", "VisitorBatting9PlayerID",
  "VisitorBatting9Name", "VisitorBatting9Position",
  "HomeBatting1PlayerID", "HomeBatting1Name",
  "HomeBatting1Position", "HomeBatting2PlayerID",
  "HomeBatting2Name", "HomeBatting2Position",
  "HomeBatting3PlayerID", "HomeBatting3Name",
  "HomeBatting3Position", "HomeBatting4PlayerID",
  "HomeBatting4Name", "HomeBatting4Position",
  "HomeBatting5PlayerID", "HomeBatting5Name",
  "HomeBatting5Position", "HomeBatting6PlayerID",
  "HomeBatting6Name", "HomeBatting6Position",
  "HomeBatting7PlayerID", "HomeBatting7Name",
  "HomeBatting7Position", "HomeBatting8PlayerID",
  "HomeBatting8Name", "HomeBatting8Position",
  "HomeBatting9PlayerID", "HomeBatting9Name",
  "HomeBatting9Position", "AdditionalInfo",
  "AcquisitionInfo"
)
# The function load.gamelog reads in the Retrosheet gamelog file for a
# particular season.
#
# Arguments:
#   season  - season identifier pasted into the file names, giving
#             "gl<season>.zip" and "gl<season>.txt" (e.g. 2002).
#   headers - vector of variable names assigned to the columns of the game log.
#
# Returns the game log as a data frame whose names are set to headers.
#
# Side effects: downloads the zip archive into the working directory, unzips
# it there, and deletes both the archive and the extracted text file on exit.
load.gamelog <- function(season, headers) {
  zip.file <- paste0("gl", season, ".zip")
  txt.file <- paste0("gl", season, ".txt")

  # Clean up on every exit path, so a failed download or read does not leave
  # the archive or the text file behind. unlink() is silent for missing files.
  on.exit(unlink(c(zip.file, txt.file)), add = TRUE)

  download.file(
    url = paste0("http://www.retrosheet.org/gamelogs/gl", season, ".zip"),
    destfile = zip.file,
    mode = "wb"  # the archive is binary
  )
  unzip(zip.file)

  gamelog <- read.table(txt.file, sep = ",", stringsAsFactors = FALSE)
  names(gamelog) <- headers
  gamelog
}
# The vector Headers defined earlier in this file contains the variable names.
# We use the load.gamelog function to read in the game logs for the 2002 season.
gl2002 <- load.gamelog(2002, Headers)
# The function find.team.streaks finds the length of all winning and losing
# streaks for a specific team for a particular season.
#
# Arguments:
#   team - team abbreviation, matched against the HomeTeam and VisitingTeam
#          columns of data.
#   data - game log data frame with the columns HomeTeam, VisitingTeam,
#          HomeTeamGameNumber, VisitingTeamGameNumber, HomeRunsScore and
#          VisitorRunsScored.
#
# Returns a list with components Winning and Losing, each a vector of streak
# lengths in the order the streaks occurred.
#
# NOTE: a game counts as a win only when the team scored strictly more runs
# than its opponent; a tied score is coded 0 and therefore extends a losing
# streak.
find.team.streaks <- function(team, data) {
  # Lengths of the runs of non-zero values in y. The vector is padded with a
  # zero on each side so that runs touching either end are closed off; the gap
  # between consecutive zeros, minus one, is the length of the run between them.
  streaks <- function(y) {
    n <- length(y)
    is.zero <- c(0, y, 0) == 0
    zero.positions <- (0:(n + 1))[is.zero]
    run.lengths <- diff(zero.positions) - 1
    run.lengths[run.lengths > 0]
  }

  # Rows are selected by explicit indexing rather than subset(): subset()
  # evaluates its condition inside the data frame, so a column named "team"
  # would silently be used in place of the team argument.
  home <- data[which(data$HomeTeam == team), , drop = FALSE]
  home$GameNumber <- home$HomeTeamGameNumber
  home$Win <- as.numeric(home$HomeRunsScore > home$VisitorRunsScored)

  visiting <- data[which(data$VisitingTeam == team), , drop = FALSE]
  visiting$GameNumber <- visiting$VisitingTeamGameNumber
  visiting$Win <- as.numeric(visiting$HomeRunsScore < visiting$VisitorRunsScored)

  # Put the home and away games back into the order in which the team played
  # them before looking for runs of wins and losses.
  streak.data <- rbind(home, visiting)
  streak.data <- streak.data[order(streak.data$GameNumber), ]

  list(
    Winning = streaks(streak.data$Win),
    Losing = streaks(1 - streak.data$Win)
  )
}
# We use the find.team.streaks function to find the streak lengths of
# Oakland (team abbreviation "OAK") and Philadelphia (team abbreviation "PHI")
# for the 2002 season.
find.team.streaks("OAK", gl2002)
find.team.streaks("PHI", gl2002)
# The vector teams contains the team abbreviation for all teams (taken from the
# HomeTeam column). Then using the sapply function, we find the lengths of
# winning and losing streaks for all teams in the 2002 season. Because every
# call returns a list with the two components Winning and Losing, S is indexed
# below by streak type and team: S[[type, team]].
teams <- as.character(unique(gl2002$HomeTeam))
S <- sapply(teams, find.team.streaks, gl2002)

# We create a data frame containing the lengths of all streaks. There are three
# variables: Team, Type (whether it is a winning streak or a losing streak),
# and Streak. All winning streaks come first, then all losing streaks, with the
# teams in the order of the vector teams within each type.
# The per-team pieces are built in a list and bound once instead of growing D
# with rbind inside a loop. Team and Type are repeated to the number of
# streaks, so a team with no streak of a given type contributes zero rows
# rather than making data.frame() fail.
D <- do.call(rbind, lapply(c("Winning", "Losing"), function(type) {
  do.call(rbind, lapply(teams, function(j) {
    streak.lengths <- S[[type, j]]
    data.frame(
      Team = rep(j, length(streak.lengths)),
      Type = rep(type, length(streak.lengths)),
      Streak = streak.lengths
    )
  }))
}))
head(D)
# Construct a graph using ggplot2 showing the lengths of all streaks for all
# teams in the 2002 season. The points are jittered so one can see individual
# points. Winning and losing streaks are drawn in separate panels so their
# lengths can be compared.
library(ggplot2)
print(
  ggplot(D, aes(Team, Streak)) +
    geom_point(position = "jitter") +
    coord_flip() +
    facet_wrap(~ Type) +
    ggtitle("Lengths of Streaks in 2002 Season")
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment