Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active July 3, 2020 15:13
Show Gist options
  • Save bayesball/18211e3e2539651d02baf9bf153acfd5 to your computer and use it in GitHub Desktop.
Save bayesball/18211e3e2539651d02baf9bf153acfd5 to your computer and use it in GitHub Desktop.
Markdown file that illustrates the simulation of a 60-game baseball season using a Bradley-Terry model.
---
title: "2020 Season Simulation"
author: Jim Albert
date: June 27, 2020
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE,
message = FALSE,
warning = FALSE)
```
#### Introduction
Simulates a 2020 season of 60 games using the Bradley-Terry model.
By repeated simulations, can explore the relationship between talent and season performance.
#### Main functions
Read in two functions:
- ```one.simulation.20()``` performs the simulation
- ```print_standings()``` prints the division standings of the simulation
```{r, echo = FALSE}
one.simulation.20 <- function(s.talent = 0.30){
require(dplyr)
make.schedule <- function(teams, k){
n.teams <- length(teams)
Home <- rep(rep(teams, each=n.teams), k)
Visitor <- rep(rep(teams, n.teams), k)
schedule <- tibble(Home = Home,
Visitor = Visitor)
dplyr::filter(schedule, Home != Visitor)
}
NL_East <- c("ATL", "MIA", "NYN", "PHI", "WAS")
NL_Cent <- c("CHN", "CIN", "MIL", "PIT", "SLN")
NL_West <- c("ARI", "COL", "LAN", "SDN", "SFN")
AL_East <- c("BAL", "BOS", "NYA", "TBA", "TOR")
AL_Cent <- c("CHA", "CLE", "DET", "KCR", "MIN")
AL_West <- c("OAK", "HOU", "TEX", "LAA", "SEA")
teams <- c(NL_East, NL_Cent, NL_West,
AL_East, AL_Cent, AL_West)
league <- c(rep("NL", 15), rep("AL", 15))
division <- c(rep("NL_East", 5), rep("NL_Cent", 5),
rep("NL_West", 5), rep("AL_East", 5),
rep("AL_Cent", 5), rep("AL_West", 5))
Team_info <- data.frame(Team = teams,
League = league,
Division = division)
s1 <- make.schedule(NL_East, 3)
s2 <- make.schedule(AL_East, 3)
s3 <- make.schedule(c(NL_East, AL_East), 2)
s4 <- make.schedule(NL_Cent, 3)
s5 <- make.schedule(AL_Cent, 3)
s6 <- make.schedule(c(NL_Cent, AL_Cent), 2)
s7 <- make.schedule(NL_West, 3)
s8 <- make.schedule(AL_West, 3)
s9 <- make.schedule(c(NL_West, AL_West), 2)
schedule <- rbind(s1, s2, s3, s4, s5, s6,
s7, s8, s9)
# simulate talents
talents <- rnorm(30, 0, s.talent)
TAL <- tibble(Team = teams,
Talent = talents)
# merge talents and win probs with schedule data frame
SCH <- schedule %>%
inner_join(TAL, by = c("Home" = "Team")) %>%
rename(Talent.Home = Talent) %>%
inner_join(TAL, by = c("Visitor" = "Team")) %>%
rename(Talent.Visitor = Talent)
# play season of games
SCH %>%
mutate(prob.Home = exp(Talent.Home) /
(exp(Talent.Home) + exp(Talent.Visitor))) -> SCH
SCH %>%
mutate(outcome = rbinom(nrow(.), 1, prob.Home),
winner = ifelse(outcome,
Home, Visitor)) -> SCH
# compute number of games won for all teams
SCH %>%
group_by(winner) %>%
summarize(Wins = n(), .groups = "drop") %>%
inner_join(TAL, by = c("winner" = "Team")) ->
RESULTS
# add team info
RESULTS <- inner_join(RESULTS, Team_info,
by = c("winner" = "Team"))
# determine division winners
out <- RESULTS %>%
mutate(Winner.Div = 0,
prob = exp(Talent),
outcome = sample(nrow(.), prob = prob)) %>%
arrange(Division, desc(Wins), outcome) %>%
dplyr::select(-outcome)
out[c(1, 6, 11, 16, 21, 26), "Winner.Div"] <- 1
# determine wild card teams
out2 <- out %>%
mutate(Wild.Card = 0,
outcome = sample(nrow(.), prob = prob)) %>%
slice(- c(1, 6, 11, 16, 21, 26)) %>%
arrange(League, desc(Wins), outcome) %>%
dplyr::select(-outcome)
out2[c(1, 2, 13, 14), "Wild.Card"] <- 1
# put all data together
out$Wild.Card <- 0
out_final <- rbind(out[c(1, 6, 11, 16, 21, 26), ],
out2) %>%
arrange(Division, desc(Wins)) %>%
dplyr::select(-prob) %>%
mutate(Team = winner) %>%
dplyr::select(Team, League, Division,
Talent, Wins,
Winner.Div, Wild.Card)
out_final
}
print_standings <- function(out){
require(dplyr)
out %>%
mutate(Losses = 60 - Wins,
Line = paste(Team, Wins, Losses, sep=" ")) %>%
dplyr::select(Line) -> out2
breaks <- c("-------------", "-------------")
Heading1 <- c(" AL East W L", "NL East W L")
results1 <- cbind(out2[6:10, ], out2[21:25, ])
names(results1) <- c("A", "B")
d1 <- rbind(Heading1, breaks, results1)
Heading2 <- c("AL Cent W L", "NL Cent W L")
results2 <- cbind(out2[1:5, ], out2[16:20, ])
names(results2) <- c("A", "B")
d2 <- rbind(Heading2, breaks, results2)
Heading3 <- c("AL West W L", "NL West W L")
results3 <- cbind(out2[11:15, ], out2[26:30, ])
names(results3) <- c("A", "B")
d3 <- rbind(Heading3, breaks, results3)
d <- rbind(d1, "", d2, "", d3)
names(d) <- c("", "")
rownames(d) <- NULL
print(d, row.names = FALSE)
}
```
Read in several packages that I'll need.
```{r}
library(ggplot2)
library(dplyr)
library(MASS)
```
Also I am adding several helper functions ```increasefont()``` and ```centertitle()``` that I will use in ggplot2.
```{r}
increasefont <- function(Size = 18){
theme(text = element_text(size = Size))
}
centertitle <- function(Color = "blue"){
theme(plot.title =
element_text(colour = Color, size = 18,
hjust = 0.5, vjust = 0.8, angle = 0))
}
```
#### One simulation
One simulation assuming a standard deviation of 0.3.
```{r}
set.seed(123)
out <- one.simulation.20(0.3)
print_standings(out)
```
#### 1000 Simulations
Store the results of 1000 simulated seasons:
```{r}
all_out <- NULL
for(j in 1:1000){
out <- one.simulation.20(0.3)
out$Simulation <- j
all_out <- rbind(all_out, out)
}
```
#### Talent and probability of making playoffs
Define a new variable Level that is 0 (missed playoffs), 1 (wild-card), or 2 (win division).
```{r}
all_out %>%
mutate(Level = as.factor(2 * (Winner.Div == 1) +
(Wild.Card == 1))) -> all_out
```
Level is an ordinal response variable. Using a proportional odds model, model the ordinal response as a function of the team's talent.
For a range of talent values from -0.9 to 0.9, find the fitted model probability of the three outcomes.
```{r}
pfit <- polr(Level ~ Talent,
data = all_out)
DF <- data.frame(Talent = seq(-0.9, 0.9,
length.out = 200))
Prob <- predict(pfit, DF, type = "probs")
DF1 <- DF2 <- DF3 <- DF
DF1$Probability <- Prob[, 3]; DF1$Type <- "Division"
DF2$Probability <- Prob[, 2]; DF2$Type <- "Wild Card"
DF3$Probability <- Prob[, 3] + Prob[, 2]
DF3$Type <- "Playoff"
DFall <- rbind(DF1, DF2, DF3)
```
Graph of this division, wild card, and playoff probabilities.
```{r}
ggplot(DFall, aes(Talent, Probability,
color = Type)) +
geom_line() +
increasefont() +
ggtitle("Probability of Reaching Different Levels") +
centertitle()
```
#### Talents of wild card teams and division-winners
Construct density estimates of
- all teams
- teams that win division
- teams that get wild card
```{r}
data1 <- filter(all_out, Wild.Card == 1) %>%
mutate(Type = "Wild Card")
data2 <- filter(all_out, Winner.Div == 1) %>%
mutate(Type = "Win Division")
data3 <- all_out %>%
mutate(Type = "All Teams")
ggplot(rbind(data1, data2, data3),
aes(Talent, color = Type)) +
geom_density(size = 1.5) +
increasefont() + centertitle() +
ggtitle("Talent Distributions for Three Types of Teams")
```
#### How many games to win?
Find the conditional probability of winning division title, of getting wild card, and making the playoffs
given you won X games.
```{r}
all_out %>%
filter(Wins >= 25, Wins <= 45) %>%
group_by(Wins) %>%
summarize(N = n(),
Prob_Div = mean(Winner.Div),
Prob_WC = mean(Wild.Card),
.groups = "drop") -> S
```
Plot smooths of these empirical probabilities.
```{r}
S %>%
ggplot() +
geom_smooth(aes(Wins, Prob_Div), se = FALSE,
color = "red",
method = "loess") +
geom_smooth(aes(Wins, Prob_WC), se = FALSE,
color = "blue",
method = "loess") +
geom_smooth(aes(Wins, Prob_Div + Prob_WC),
se = FALSE,
color = "black",
method = "loess") +
increasefont() +
ylab("Probability") +
annotate(geom = "text", x = 32, y = 0.75,
label = "Playoffs", size = 7) +
annotate(geom = "text", x = 40.3, y = 0.75,
label = "Division", size = 7,
color = "red") +
annotate(geom = "text", x = 40.8, y = 0.25,
label = "Wild Card", size = 7,
color = "blue") +
ggtitle("How Many Games Does Team Need to Win?") +
centertitle()
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment