Skip to content

Instantly share code, notes, and snippets.

@darh78
Created January 2, 2017 18:17
Show Gist options
  • Save darh78/a8a30e6d03c74c5cd0d1fe5911fcab7f to your computer and use it in GitHub Desktop.
Save darh78/a8a30e6d03c74c5cd0d1fe5911fcab7f to your computer and use it in GitHub Desktop.
#title: Overall rank of MLB teams in regular season
#output: html_document
# author: D. Hernández
#tags: [R, baseball, lahman, MLB, postseason]
# Loading packages
library(Lahman) # Data source
library(dplyr) # To ease the data manipulation
### Data preparation
# getting the data from Lahman package: Since 1995 to 2015
Teams_data <- tbl_df(Teams) %>%
select(yearID, name, teamIDBR, W, L, DivWin, WCWin, WSWin) %>%
filter(yearID >= 1994)
#Let's add a variable with the winning percentage, `WLP`, and order the tibble by `yearID` and `WLP` (in descending order).
Teams_data <- mutate(Teams_data,
WLP = W/(W+L)) %>%
arrange(yearID, desc(WLP))
# Now that `Teams_data` is ordered, let's add a new variable with the overall rank (called `OverallRank`) for each team on each regular season, defining number 1 to the team with the best `WLP`, 2 for the team with the best second `WLP` and so on.
Teams_data <- mutate(Teams_data,
OverallRank = ave(WLP, yearID, FUN = seq_along))
# By the time of building this article, the R Lahman Package (in version 5.0-0) did not have the 2016 data. So, I had to build an Excel file to get the 2016 season's info [baseball-reference](http://www.baseball-reference.com/leagues/MLB/2016-standings.shtml). Then, bind this 2016 data to the `Teams_data` tibble.
# get 2016 data from baseball-reference
library(readxl) # To read Excel files into R
T2016 <- tbl_df(read_excel("C:/Users/1328/Documents/R projects/darh78.github.io/data/T2016.xlsx"))
# binding both tibbles
Teams_data <- rbind(Teams_data, T2016)
#During this period (1994-2016), four MLB franchises changed their names at least once, so I standardized each team ID with the ID of MLB franchises that played the 2016 season.
Teams_data <- mutate(Teams_data,
FranchID = ifelse(teamIDBR == "ANA" | teamIDBR == "CAL" | teamIDBR == "LAA", "LAA", # Anaheim Angels
ifelse(teamIDBR == "FLA" | teamIDBR == "MIA", "MIA", # Miami Marlins
ifelse(teamIDBR == "MON" | teamIDBR == "WAS" | teamIDBR == "WSN", "WSN", # Washington Nationals
ifelse(teamIDBR == "TBD" | teamIDBR == "TBR", "TBR", # Tampa Bay Rays
teamIDBR)
)
)
)
)
### Data cleaning
#Additionally, to better prepare `Teams_data` for the analysis, let's modify some of the `classes` of the variables and give better names to some of them:
Teams_data$WLP <- as.numeric(Teams_data$WLP)
Teams_data$yearID <- as.integer(Teams_data$yearID)
Teams_data$W <- as.integer(Teams_data$W)
Teams_data$L <- as.integer(Teams_data$L)
Teams_data$OverallRank <- as.integer(Teams_data$OverallRank)
Teams_data <- Teams_data %>% rename(Season = yearID, Team = name)
#Resulting on a tibble with this preview:
Teams_data %>% slice(c(1, 100, 200, 300, 400, 500))
### Exploratory Data Analysis
#Before analyzing or ploting anything, let's see a summary of the variables in the tibble `TeamsStd`:
summary(Teams_data)
#From this summary we can extract that the maximum number of wins by a team in a single season is 116, while the maximum number of losses is 119. Let's check who were those teams:
knitr::kable(Teams_data %>%
filter(W == 116 | L == 119) %>%
select(Season, Team, W, L, WLP, OverallRank))
#These two records correspond also to the minumim and maximum `WLP` registered in this period.
#Now, let's visualize the winning percentage of each team between 1994 and 2016.
library(ggplot2) # To visualize results
library(ggthemes)# To format vizes
Linegraph <- ggplot(Teams_data, aes(x = Season, y = OverallRank)) +
geom_point(aes(color = OverallRank), size = 1.35) + #color = "cadetblue3",
scale_colour_gradient2(low = "darkgreen", mid = "gold", high = "red", midpoint = 10) +
geom_smooth(color = "black", size = .7, se = F) +
guides(color = FALSE) +
scale_y_reverse(breaks = c(1,30)) +
facet_wrap(~ FranchID, ncol = 5) +
labs(title = "Overall rank of MLB teams in regular season",
subtitle = "based on WLP in the Wild Card Era (since 1994)",
caption = "Data from Lahman R package 5.0-0")+
theme_tufte() +
theme(axis.ticks = element_blank(),
panel.grid.major.y = element_line(colour = "gray86", linetype = "dotted", size = 0.1),
panel.grid.minor.y = element_blank(),
strip.text.x = element_text(size = 10, family = "serif", face = "bold", colour = "black", angle = 0),
axis.text.x=element_text(angle = 90, hjust = 0, vjust = 1, size = 7),
axis.text.y=element_text(angle = 0, hjust = 1, vjust = 0.5, size = 6)) +
scale_x_continuous(breaks = c(1994, 1998, 2002, 2006, 2010, 2014))
Linegraph
#Now, let's see how teams clinched the postseason and who of them became World Champs based on their overall rank during regular season.
#First let's add a new variable, called `clinch` with this info.
Teams_data <- mutate(Teams_data,
clinch = ifelse((DivWin == "Y" | WCWin == "Y") & WSWin == "N", "Clinched Playoff",
ifelse(WSWin == "Y", "World Champion", NA)))
#And let's add those results to the previous sparkline plot.
Linegraph_ps <- ggplot(Teams_data, aes(x = Season, y = OverallRank)) +
geom_line(color = "cadetblue3", size = .8) +
geom_point(aes(shape = clinch, color = clinch, fill = clinch)) +
scale_color_manual(name = "Team's performance",
breaks = c("Clinched Playoff", "World Champion"),
values = c("darkblue", "red3"),
labels = c("Clinched Playoff", "World Champion")) +
scale_shape_manual(name = "Team's performance",
breaks = c("Clinched Playoff", "World Champion"),
values = c(21, 18),
labels = c("Clinched Playoff", "World Champion")) +
scale_fill_manual(name = "Team's performance",
breaks = c("Clinched Playoff", "World Champion"),
values = c("white", "red3"),
labels = c("Clinched Playoff", "World Champion")) +
scale_y_reverse(breaks = c(1,30)) +
facet_wrap(~ FranchID, ncol = 5) +
labs(title = "Overall rank of MLB teams in regular season",
subtitle = "based on WLP in Wild Card Era (since 1995)",
caption = "Data from Lahman R package 5.0-0")+
theme_tufte() +
theme(axis.ticks = element_blank(),
panel.grid.major.y = element_line(colour = "gray86", linetype = "dotted", size = 0.1),
panel.grid.minor.y = element_blank(),
strip.text.x = element_text(size = 10, family = "serif", face = "bold", colour = "black", angle = 0),
axis.text.x=element_text(angle = 90, hjust = -2, vjust = 1, size = 7),
axis.text.y=element_text(angle = 0, hjust = 1, vjust = 0.5, size = 6)) +
scale_x_continuous(breaks = c(1994, 1998, 2002, 2006, 2010, 2014))
Linegraph_ps
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment