Last active
September 3, 2020 15:11
-
-
Save ilarischeinin/7ef934c00033d3fc0803e27324eb110a to your computer and use it in GitHub Desktop.
Ranking NHL’s best shooters with Bayesian multilevel modeling - https://ilari.scheinin.fi/ranking-nhls-best-shooters/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ranking NHL’s best shooters with Bayesian multilevel modeling | |
# https://ilari.scheinin.fi/ranking-nhls-best-shooters/ | |
# Ilari Scheinin | |
# firstname.lastname@gmail.com | |
# Code: MIT Licence | |
# Figures and tables: CC-BY | |
# For inclusion in the blog post, the main code is split into snippets in | |
# separate files named like *-chunk.R. | |
# This file exists purely for the boilerplate above, and for running the entire | |
# analysis from the command line: | |
# Rscript 00-sharp-shooter.R | |
set.seed(20162017L) | |
png( | |
"sharp-shooter-%i.png", | |
width = 210L, height = 148L, units = "mm", res = 150L | |
) | |
# List the *-chunk.R files and source them. This would be more readable with | |
# pipes and purr::walk(), but unlike the main code chunks, this file | |
# intentionally sticks with base R. | |
invisible(lapply(list.files(pattern = "^[0-9]*-chunk.R$"), source, echo = TRUE)) | |
dev.off() | |
save.image("sharp-shooter-session.rda") | |
# EOF |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the packages used. Install them first if needed. | |
library(tidyverse) | |
library(rstan) | |
library(DT) | |
library(knitr) | |
library(scales) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nhl <- read_csv("player-shots-and-goals-by-season.csv") | |
# How to obtain the raw data is not covered here. There are a few sites that | |
# host NHL stats, and the one I used was hockey-reference.com. In addition to | |
# the columns shown in the blog post, the data file contains an integer column | |
# player_id, because there are a few instances of two players with the same | |
# name. | |
nhl %>% | |
head(n = 10L) %>% | |
select(player, position, season, shots, goals) %>% | |
kable(format = "html", row.names = TRUE, format.args = list(big.mark = ",")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# These new columns will be used later. | |
nhl <- | |
nhl %>% | |
mutate( | |
year = season %>% sub(pattern = "^.*-", replacement = "") %>% as.integer(), | |
season_id = as.integer(as.factor(season)), | |
position_id = as.integer(as.factor(position)) | |
) | |
players <- | |
nhl %>% | |
group_by(player_id) %>% | |
summarise( | |
player = first(player), | |
position_id = first(position_id), | |
position = first(position), | |
start = min(year) - 1L, | |
end = max(year), | |
mid = (start + end) / 2, | |
career = | |
paste0(start, "–", ifelse(end == max(nhl$year), yes = "", no = end)), | |
shots = sum(shots), | |
goals = sum(goals), | |
raw = goals / shots | |
) | |
players %>% | |
head(n = 10L) %>% | |
select(player, position, career, shots, goals, raw) %>% | |
mutate( | |
raw = ifelse(is.na(raw), yes = "", no = sprintf("%.2f%%", raw * 100)) | |
) %>% | |
kable(format = "html", row.names = TRUE, align = "lllrrr", | |
format.args = list(big.mark = ",") | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
players %>% | |
ggplot(aes(raw, color = position)) + | |
geom_density() + | |
scale_x_continuous(labels = percent) + | |
labs( | |
subtitle = "Career shooting percentages per player position", | |
x = "raw career shooting percentage" | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nhl %>% | |
group_by(year) %>% | |
summarise(average = sum(goals) / sum(shots)) %>% | |
ggplot(aes(year, average)) + | |
geom_point() + | |
scale_x_continuous(labels = function(x) paste0(x - 1L, "-", x)) + | |
scale_y_continuous(labels = percent, limits = c(0, 0.15)) + | |
labs( | |
subtitle = "Overall shooting percentages per season", | |
x = "season", | |
y = "overall league shooting percentage" | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data { | |
int<lower=1> n_observations; | |
int<lower=1> n_seasons; | |
int<lower=1> n_players; | |
int<lower=1> n_positions; | |
int<lower=1,upper=n_seasons> season[n_observations]; | |
int<lower=1,upper=n_players> player[n_observations]; | |
int<lower=1,upper=n_positions> position[n_players]; | |
int<lower=0> shots[n_observations]; | |
int<lower=0> goals[n_observations]; | |
} | |
parameters { | |
real league_mu; | |
real<lower=0> league_sigma; | |
vector[n_positions] position_mu; | |
vector<lower=0>[n_positions] position_sigma; | |
vector[n_players] skill; | |
vector[n_seasons-1] difficulty_raw; | |
} | |
transformed parameters { | |
vector[n_seasons] difficulty; // so that they sum up to zero | |
difficulty[1:(n_seasons-1)] = difficulty_raw; | |
difficulty[n_seasons] = -sum(difficulty_raw); | |
} | |
model { | |
league_mu ~ normal(-2, 1); // 95% probability to be between 2% and 50% | |
league_sigma ~ cauchy(0, 1); | |
position_mu ~ normal(league_mu, league_sigma); | |
position_sigma ~ cauchy(0, 1); | |
skill ~ normal(position_mu[position], position_sigma[position]); | |
difficulty_raw ~ normal(0, 1); | |
goals ~ binomial_logit(shots, skill[player] - difficulty[season]); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fit <- | |
stan("06-model.stan", | |
data = list( | |
n_observations = nrow(nhl), | |
n_seasons = max(nhl$season_id), | |
n_players = nrow(players), | |
n_positions = max(nhl$position_id), | |
season = nhl$season_id, | |
player = nhl$player_id, | |
position = players$position_id, | |
shots = nhl$shots, | |
goals = nhl$goals | |
), | |
seed = 20162017L | |
) | |
# Check diagnostic plots to make sure that chains converged. | |
# shinystan::launch_shinystan(fit) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
posterior <- as.matrix(fit) | |
nhl <- | |
nhl %>% | |
mutate( | |
simulation = pmap_int( | |
list(shots, player_id, season_id), | |
~rbinom( | |
1L, | |
size = ..1, | |
prob = plogis( | |
posterior[, paste0("skill[", ..2, "]")] - | |
posterior[, paste0("difficulty[", ..3, "]")] | |
) | |
) | |
) | |
) | |
nhl %>% | |
group_by(player_id) %>% | |
summarise( | |
position = first(position), | |
actual = sum(goals) / sum(shots), | |
simulated = sum(simulation) / sum(shots) | |
) %>% | |
gather(key, value, actual, simulated) %>% | |
ggplot(aes(value, linetype = key, color = position)) + | |
geom_density() + | |
scale_x_continuous(labels = percent) + | |
scale_linetype_discrete(name = "goals") + | |
labs( | |
subtitle = paste("Simulated and actual career shooting percentages per", | |
"player position" | |
), | |
x = "raw career shooting percentage" | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nhl %>% | |
group_by(year) %>% | |
summarise( | |
actual = sum(goals) / sum(shots), | |
simulated = sum(simulation) / sum(shots) | |
) %>% | |
gather(key, value, -year) %>% | |
ggplot(aes(year, value, color = key)) + | |
geom_point() + | |
scale_x_continuous(labels = function(x) paste0(x - 1L, "-", x)) + | |
scale_y_continuous(labels = percent, limits = c(0, 0.15)) + | |
scale_color_discrete(name = "goals") + | |
labs( | |
subtitle = "Simulated and actual overall shooting percentages per season", | |
x = "season", | |
y = "overall league shooting percentage" | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ggplot(nhl, aes(goals, simulation)) + | |
geom_jitter(alpha = 0.1) + | |
geom_abline(slope = 1, intercept = 0) + | |
coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) + | |
labs( | |
subtitle = "Simulated and actual goals scored per player per season", | |
x = "goals scored (actual)", | |
y = "goals scored (simulated)" | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
players <- | |
players %>% | |
left_join( | |
posterior[, grep("^skill", colnames(posterior), value = TRUE)] %>% | |
apply(2L, median) %>% | |
tibble( | |
player_id = names(.) %>% | |
gsub(pattern = "[^0-9]", replacement = "") %>% | |
as.integer(), | |
model = . | |
) %>% | |
mutate(model = plogis(model)), | |
by = "player_id" | |
) %>% | |
arrange(desc(model)) | |
players %>% | |
ggplot(aes(raw, model, color = mid)) + | |
geom_point(alpha = 0.1) + | |
scale_x_continuous(labels = percent) + | |
scale_y_continuous(labels = percent) + | |
scale_color_continuous("career midpoint") + | |
labs( | |
subtitle = "Raw career shooting percentages and modeled shooting skills", | |
x = "raw career shooting percentage", | |
y = "modeled shooting skill" | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
players %>% | |
head(n = 10L) %>% | |
select(player, position, career, shots, goals, raw, model) %>% | |
mutate( | |
raw = sprintf("%.2f%%", raw * 100), | |
model = sprintf("%.2f%%", model * 100) | |
) %>% | |
kable(format = "html", row.names = TRUE, align = "lllrrrr", | |
format.args = list(big.mark = ",") | |
) | |
players %>% | |
filter(position == "defenceman") %>% | |
head(n = 10L) %>% | |
select(player, position, career, shots, goals, raw, model) %>% | |
mutate( | |
raw = sprintf("%.2f%%", raw * 100), | |
model = sprintf("%.2f%%", model * 100) | |
) %>% | |
kable(format = "html", row.names = TRUE, align = "lllrrrr", | |
format.args = list(big.mark = ",") | |
) | |
players %>% | |
select(player, position, career, shots, goals, raw, model) %>% | |
datatable(filter = "top") %>% | |
formatCurrency("shots", currency = "", mark = ",", digits = 0L) %>% | |
formatPercentage("raw", digits = 2L) %>% | |
formatPercentage("model", digits = 2L) %>% | |
saveWidget("sharp-shooter.html") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment