Skip to content

Instantly share code, notes, and snippets.

@ilarischeinin
Last active September 3, 2020 15:11
Show Gist options
  • Save ilarischeinin/7ef934c00033d3fc0803e27324eb110a to your computer and use it in GitHub Desktop.
Save ilarischeinin/7ef934c00033d3fc0803e27324eb110a to your computer and use it in GitHub Desktop.
Ranking NHL’s best shooters with Bayesian multilevel modeling - https://ilari.scheinin.fi/ranking-nhls-best-shooters/
# Ranking NHL’s best shooters with Bayesian multilevel modeling
# https://ilari.scheinin.fi/ranking-nhls-best-shooters/
# Ilari Scheinin
# firstname.lastname@gmail.com
# Code: MIT Licence
# Figures and tables: CC-BY
# For inclusion in the blog post, the main code is split into snippets in
# separate files named like *-chunk.R.
# This file exists purely for the boilerplate above, and for running the entire
# analysis from the command line:
# Rscript 00-sharp-shooter.R
set.seed(20162017L)
png(
"sharp-shooter-%i.png",
width = 210L, height = 148L, units = "mm", res = 150L
)
# List the *-chunk.R files and source them. This would be more readable with
# pipes and purr::walk(), but unlike the main code chunks, this file
# intentionally sticks with base R.
invisible(lapply(list.files(pattern = "^[0-9]*-chunk.R$"), source, echo = TRUE))
dev.off()
save.image("sharp-shooter-session.rda")
# EOF
# Load the packages used. Install them first if needed.
library(tidyverse)
library(rstan)
library(DT)
library(knitr)
library(scales)
nhl <- read_csv("player-shots-and-goals-by-season.csv")
# How to obtain the raw data is not covered here. There are a few sites that
# host NHL stats, and the one I used was hockey-reference.com. In addition to
# the columns shown in the blog post, the data file contains an integer column
# player_id, because there are a few instances of two players with the same
# name.
nhl %>%
head(n = 10L) %>%
select(player, position, season, shots, goals) %>%
kable(format = "html", row.names = TRUE, format.args = list(big.mark = ","))
# These new columns will be used later.
nhl <-
nhl %>%
mutate(
year = season %>% sub(pattern = "^.*-", replacement = "") %>% as.integer(),
season_id = as.integer(as.factor(season)),
position_id = as.integer(as.factor(position))
)
players <-
nhl %>%
group_by(player_id) %>%
summarise(
player = first(player),
position_id = first(position_id),
position = first(position),
start = min(year) - 1L,
end = max(year),
mid = (start + end) / 2,
career =
paste0(start, "–", ifelse(end == max(nhl$year), yes = "", no = end)),
shots = sum(shots),
goals = sum(goals),
raw = goals / shots
)
players %>%
head(n = 10L) %>%
select(player, position, career, shots, goals, raw) %>%
mutate(
raw = ifelse(is.na(raw), yes = "", no = sprintf("%.2f%%", raw * 100))
) %>%
kable(format = "html", row.names = TRUE, align = "lllrrr",
format.args = list(big.mark = ",")
)
players %>%
ggplot(aes(raw, color = position)) +
geom_density() +
scale_x_continuous(labels = percent) +
labs(
subtitle = "Career shooting percentages per player position",
x = "raw career shooting percentage"
)
nhl %>%
group_by(year) %>%
summarise(average = sum(goals) / sum(shots)) %>%
ggplot(aes(year, average)) +
geom_point() +
scale_x_continuous(labels = function(x) paste0(x - 1L, "-", x)) +
scale_y_continuous(labels = percent, limits = c(0, 0.15)) +
labs(
subtitle = "Overall shooting percentages per season",
x = "season",
y = "overall league shooting percentage"
)
data {
int<lower=1> n_observations;
int<lower=1> n_seasons;
int<lower=1> n_players;
int<lower=1> n_positions;
int<lower=1,upper=n_seasons> season[n_observations];
int<lower=1,upper=n_players> player[n_observations];
int<lower=1,upper=n_positions> position[n_players];
int<lower=0> shots[n_observations];
int<lower=0> goals[n_observations];
}
parameters {
real league_mu;
real<lower=0> league_sigma;
vector[n_positions] position_mu;
vector<lower=0>[n_positions] position_sigma;
vector[n_players] skill;
vector[n_seasons-1] difficulty_raw;
}
transformed parameters {
vector[n_seasons] difficulty; // so that they sum up to zero
difficulty[1:(n_seasons-1)] = difficulty_raw;
difficulty[n_seasons] = -sum(difficulty_raw);
}
model {
league_mu ~ normal(-2, 1); // 95% probability to be between 2% and 50%
league_sigma ~ cauchy(0, 1);
position_mu ~ normal(league_mu, league_sigma);
position_sigma ~ cauchy(0, 1);
skill ~ normal(position_mu[position], position_sigma[position]);
difficulty_raw ~ normal(0, 1);
goals ~ binomial_logit(shots, skill[player] - difficulty[season]);
}
fit <-
stan("06-model.stan",
data = list(
n_observations = nrow(nhl),
n_seasons = max(nhl$season_id),
n_players = nrow(players),
n_positions = max(nhl$position_id),
season = nhl$season_id,
player = nhl$player_id,
position = players$position_id,
shots = nhl$shots,
goals = nhl$goals
),
seed = 20162017L
)
# Check diagnostic plots to make sure that chains converged.
# shinystan::launch_shinystan(fit)
posterior <- as.matrix(fit)
nhl <-
nhl %>%
mutate(
simulation = pmap_int(
list(shots, player_id, season_id),
~rbinom(
1L,
size = ..1,
prob = plogis(
posterior[, paste0("skill[", ..2, "]")] -
posterior[, paste0("difficulty[", ..3, "]")]
)
)
)
)
nhl %>%
group_by(player_id) %>%
summarise(
position = first(position),
actual = sum(goals) / sum(shots),
simulated = sum(simulation) / sum(shots)
) %>%
gather(key, value, actual, simulated) %>%
ggplot(aes(value, linetype = key, color = position)) +
geom_density() +
scale_x_continuous(labels = percent) +
scale_linetype_discrete(name = "goals") +
labs(
subtitle = paste("Simulated and actual career shooting percentages per",
"player position"
),
x = "raw career shooting percentage"
)
nhl %>%
group_by(year) %>%
summarise(
actual = sum(goals) / sum(shots),
simulated = sum(simulation) / sum(shots)
) %>%
gather(key, value, -year) %>%
ggplot(aes(year, value, color = key)) +
geom_point() +
scale_x_continuous(labels = function(x) paste0(x - 1L, "-", x)) +
scale_y_continuous(labels = percent, limits = c(0, 0.15)) +
scale_color_discrete(name = "goals") +
labs(
subtitle = "Simulated and actual overall shooting percentages per season",
x = "season",
y = "overall league shooting percentage"
)
ggplot(nhl, aes(goals, simulation)) +
geom_jitter(alpha = 0.1) +
geom_abline(slope = 1, intercept = 0) +
coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
labs(
subtitle = "Simulated and actual goals scored per player per season",
x = "goals scored (actual)",
y = "goals scored (simulated)"
)
players <-
players %>%
left_join(
posterior[, grep("^skill", colnames(posterior), value = TRUE)] %>%
apply(2L, median) %>%
tibble(
player_id = names(.) %>%
gsub(pattern = "[^0-9]", replacement = "") %>%
as.integer(),
model = .
) %>%
mutate(model = plogis(model)),
by = "player_id"
) %>%
arrange(desc(model))
players %>%
ggplot(aes(raw, model, color = mid)) +
geom_point(alpha = 0.1) +
scale_x_continuous(labels = percent) +
scale_y_continuous(labels = percent) +
scale_color_continuous("career midpoint") +
labs(
subtitle = "Raw career shooting percentages and modeled shooting skills",
x = "raw career shooting percentage",
y = "modeled shooting skill"
)
players %>%
head(n = 10L) %>%
select(player, position, career, shots, goals, raw, model) %>%
mutate(
raw = sprintf("%.2f%%", raw * 100),
model = sprintf("%.2f%%", model * 100)
) %>%
kable(format = "html", row.names = TRUE, align = "lllrrrr",
format.args = list(big.mark = ",")
)
players %>%
filter(position == "defenceman") %>%
head(n = 10L) %>%
select(player, position, career, shots, goals, raw, model) %>%
mutate(
raw = sprintf("%.2f%%", raw * 100),
model = sprintf("%.2f%%", model * 100)
) %>%
kable(format = "html", row.names = TRUE, align = "lllrrrr",
format.args = list(big.mark = ",")
)
players %>%
select(player, position, career, shots, goals, raw, model) %>%
datatable(filter = "top") %>%
formatCurrency("shots", currency = "", mark = ",", digits = 0L) %>%
formatPercentage("raw", digits = 2L) %>%
formatPercentage("model", digits = 2L) %>%
saveWidget("sharp-shooter.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment