Created
April 16, 2018 10:58
-
-
Save bayesball/822827aa54fd36f8c135c02225ff9628 to your computer and use it in GitHub Desktop.
Use of Bradley-Terry Model and predictive simulation to check if standard deviation of team wins is large
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(baseballr) | |
library(tidyverse) | |
NL <- standings_on_date_bref("2018-04-15", "NL Overall") | |
AL <- standings_on_date_bref("2018-04-15", "AL Overall") | |
#save(AL, NL, file="Standings") | |
load("Standings") | |
library(ggrepel) | |
ggplot(data.frame(Team = c(AL[[1]]$Tm, NL[[1]]$Tm), | |
Wins = c(AL[[1]]$W, NL[[1]]$W), | |
Y = 1), | |
aes(Wins, Y, label=Team)) + | |
geom_text_repel(angle = 60, color="blue") + | |
theme(axis.title.y=element_blank(), | |
axis.text.y=element_blank(), | |
axis.ticks.y=element_blank()) + | |
ggtitle("Number of Games Won by MLB Teams: April 15, 2018") | |
# get schedule? | |
teams <- unique(c(AL[[1]]$Tm, NL[[1]]$Tm)) | |
get_schedule <- function(team){ | |
d <- team_results_bref(team, 2018) | |
d[complete.cases(d),] | |
} | |
sch <- map_df(teams, get_schedule) | |
#save(sch, file="Schedule") | |
load("Schedule") | |
sch %>% group_by(Attendance) %>% | |
summarize(Date = first(Date), | |
Tm = first(Tm), | |
Opp = first(Opp), | |
R = first(R), | |
RA = first(RA)) %>% | |
mutate(Win = ifelse(R > RA, 1, 0)) -> sch2 | |
# here is the function that implements the simulation | |
one.simulation.18a <- function(teams, schedule, | |
s.talent = 0.25){ | |
# simulate talents | |
TAL <- data.frame(Team = teams, | |
Talent = rnorm(30, 0, s.talent)) | |
TAL$Team <- as.character(TAL$Team) | |
# merge talents and win probs with schedule data frame | |
schedule %>% | |
inner_join(TAL, by=c("Tm"="Team")) %>% | |
inner_join(TAL, by=c("Opp"="Team")) -> SCH | |
# play season | |
N <- nrow(SCH) | |
SCH %>% | |
mutate(prob.Tm = exp(Talent.x) / | |
(exp(Talent.x) + exp(Talent.y)), | |
outcome = rbinom(nrow(SCH), 1, prob.Tm), | |
winner = ifelse(outcome, Tm, Opp)) -> SCH | |
# compute sd of number of games won for all teams | |
SCH %>% group_by(winner) %>% | |
count() -> Results | |
# output sd | |
sd(Results$n) | |
} | |
one.simulation.18a(teams, select(sch2, Tm, Opp)) | |
W <- replicate(1000, | |
one.simulation.18a(teams, select(sch2, Tm, Opp))) | |
observed_sd <- sd(c(AL[[1]]$W, NL[[1]]$W)) | |
ggplot(data.frame(SD = W), aes(W)) + | |
geom_histogram(bins = 10, | |
color="black", | |
fill="orange") + | |
geom_vline(xintercept = observed_sd, | |
color="red", size=2) + | |
xlab("Standard Deviation") + | |
ggtitle("Histogram of Predictive Standard Deviation | |
of Number of Games Won from Bradley-Terry Model") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Jim - thanks for this and for the recent wordpress post. In case you're interested, feel free to check out my related package, mlbstats, which is an individual player metrics calculator, for whatever it may be worth. Thanks for this great work extending the scientific study of baseball!