Last active
September 3, 2020 15:11
-
-
Save ilarischeinin/7ef934c00033d3fc0803e27324eb110a to your computer and use it in GitHub Desktop.
Ranking NHL’s best shooters with Bayesian multilevel modeling - https://ilari.scheinin.fi/ranking-nhls-best-shooters/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Ranking NHL’s best shooters with Bayesian multilevel modeling | |
| # https://ilari.scheinin.fi/ranking-nhls-best-shooters/ | |
| # Ilari Scheinin | |
| # firstname.lastname@gmail.com | |
| # Code: MIT Licence | |
| # Figures and tables: CC-BY | |
| # For inclusion in the blog post, the main code is split into snippets in | |
| # separate files named like *-chunk.R. | |
| # This file exists purely for the boilerplate above, and for running the entire | |
| # analysis from the command line: | |
| # Rscript 00-sharp-shooter.R | |
| set.seed(20162017L) | |
| png( | |
| "sharp-shooter-%i.png", | |
| width = 210L, height = 148L, units = "mm", res = 150L | |
| ) | |
| # List the *-chunk.R files and source them. This would be more readable with | |
| # pipes and purr::walk(), but unlike the main code chunks, this file | |
| # intentionally sticks with base R. | |
| invisible(lapply(list.files(pattern = "^[0-9]*-chunk.R$"), source, echo = TRUE)) | |
| dev.off() | |
| save.image("sharp-shooter-session.rda") | |
| # EOF |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Load the packages used. Install them first if needed. | |
| library(tidyverse) | |
| library(rstan) | |
| library(DT) | |
| library(knitr) | |
| library(scales) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| nhl <- read_csv("player-shots-and-goals-by-season.csv") | |
| # How to obtain the raw data is not covered here. There are a few sites that | |
| # host NHL stats, and the one I used was hockey-reference.com. In addition to | |
| # the columns shown in the blog post, the data file contains an integer column | |
| # player_id, because there are a few instances of two players with the same | |
| # name. | |
| nhl %>% | |
| head(n = 10L) %>% | |
| select(player, position, season, shots, goals) %>% | |
| kable(format = "html", row.names = TRUE, format.args = list(big.mark = ",")) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # These new columns will be used later. | |
| nhl <- | |
| nhl %>% | |
| mutate( | |
| year = season %>% sub(pattern = "^.*-", replacement = "") %>% as.integer(), | |
| season_id = as.integer(as.factor(season)), | |
| position_id = as.integer(as.factor(position)) | |
| ) | |
| players <- | |
| nhl %>% | |
| group_by(player_id) %>% | |
| summarise( | |
| player = first(player), | |
| position_id = first(position_id), | |
| position = first(position), | |
| start = min(year) - 1L, | |
| end = max(year), | |
| mid = (start + end) / 2, | |
| career = | |
| paste0(start, "–", ifelse(end == max(nhl$year), yes = "", no = end)), | |
| shots = sum(shots), | |
| goals = sum(goals), | |
| raw = goals / shots | |
| ) | |
| players %>% | |
| head(n = 10L) %>% | |
| select(player, position, career, shots, goals, raw) %>% | |
| mutate( | |
| raw = ifelse(is.na(raw), yes = "", no = sprintf("%.2f%%", raw * 100)) | |
| ) %>% | |
| kable(format = "html", row.names = TRUE, align = "lllrrr", | |
| format.args = list(big.mark = ",") | |
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| players %>% | |
| ggplot(aes(raw, color = position)) + | |
| geom_density() + | |
| scale_x_continuous(labels = percent) + | |
| labs( | |
| subtitle = "Career shooting percentages per player position", | |
| x = "raw career shooting percentage" | |
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| nhl %>% | |
| group_by(year) %>% | |
| summarise(average = sum(goals) / sum(shots)) %>% | |
| ggplot(aes(year, average)) + | |
| geom_point() + | |
| scale_x_continuous(labels = function(x) paste0(x - 1L, "-", x)) + | |
| scale_y_continuous(labels = percent, limits = c(0, 0.15)) + | |
| labs( | |
| subtitle = "Overall shooting percentages per season", | |
| x = "season", | |
| y = "overall league shooting percentage" | |
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| data { | |
| int<lower=1> n_observations; | |
| int<lower=1> n_seasons; | |
| int<lower=1> n_players; | |
| int<lower=1> n_positions; | |
| int<lower=1,upper=n_seasons> season[n_observations]; | |
| int<lower=1,upper=n_players> player[n_observations]; | |
| int<lower=1,upper=n_positions> position[n_players]; | |
| int<lower=0> shots[n_observations]; | |
| int<lower=0> goals[n_observations]; | |
| } | |
| parameters { | |
| real league_mu; | |
| real<lower=0> league_sigma; | |
| vector[n_positions] position_mu; | |
| vector<lower=0>[n_positions] position_sigma; | |
| vector[n_players] skill; | |
| vector[n_seasons-1] difficulty_raw; | |
| } | |
| transformed parameters { | |
| vector[n_seasons] difficulty; // so that they sum up to zero | |
| difficulty[1:(n_seasons-1)] = difficulty_raw; | |
| difficulty[n_seasons] = -sum(difficulty_raw); | |
| } | |
| model { | |
| league_mu ~ normal(-2, 1); // 95% probability to be between 2% and 50% | |
| league_sigma ~ cauchy(0, 1); | |
| position_mu ~ normal(league_mu, league_sigma); | |
| position_sigma ~ cauchy(0, 1); | |
| skill ~ normal(position_mu[position], position_sigma[position]); | |
| difficulty_raw ~ normal(0, 1); | |
| goals ~ binomial_logit(shots, skill[player] - difficulty[season]); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| fit <- | |
| stan("06-model.stan", | |
| data = list( | |
| n_observations = nrow(nhl), | |
| n_seasons = max(nhl$season_id), | |
| n_players = nrow(players), | |
| n_positions = max(nhl$position_id), | |
| season = nhl$season_id, | |
| player = nhl$player_id, | |
| position = players$position_id, | |
| shots = nhl$shots, | |
| goals = nhl$goals | |
| ), | |
| seed = 20162017L | |
| ) | |
| # Check diagnostic plots to make sure that chains converged. | |
| # shinystan::launch_shinystan(fit) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| posterior <- as.matrix(fit) | |
| nhl <- | |
| nhl %>% | |
| mutate( | |
| simulation = pmap_int( | |
| list(shots, player_id, season_id), | |
| ~rbinom( | |
| 1L, | |
| size = ..1, | |
| prob = plogis( | |
| posterior[, paste0("skill[", ..2, "]")] - | |
| posterior[, paste0("difficulty[", ..3, "]")] | |
| ) | |
| ) | |
| ) | |
| ) | |
| nhl %>% | |
| group_by(player_id) %>% | |
| summarise( | |
| position = first(position), | |
| actual = sum(goals) / sum(shots), | |
| simulated = sum(simulation) / sum(shots) | |
| ) %>% | |
| gather(key, value, actual, simulated) %>% | |
| ggplot(aes(value, linetype = key, color = position)) + | |
| geom_density() + | |
| scale_x_continuous(labels = percent) + | |
| scale_linetype_discrete(name = "goals") + | |
| labs( | |
| subtitle = paste("Simulated and actual career shooting percentages per", | |
| "player position" | |
| ), | |
| x = "raw career shooting percentage" | |
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| nhl %>% | |
| group_by(year) %>% | |
| summarise( | |
| actual = sum(goals) / sum(shots), | |
| simulated = sum(simulation) / sum(shots) | |
| ) %>% | |
| gather(key, value, -year) %>% | |
| ggplot(aes(year, value, color = key)) + | |
| geom_point() + | |
| scale_x_continuous(labels = function(x) paste0(x - 1L, "-", x)) + | |
| scale_y_continuous(labels = percent, limits = c(0, 0.15)) + | |
| scale_color_discrete(name = "goals") + | |
| labs( | |
| subtitle = "Simulated and actual overall shooting percentages per season", | |
| x = "season", | |
| y = "overall league shooting percentage" | |
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ggplot(nhl, aes(goals, simulation)) + | |
| geom_jitter(alpha = 0.1) + | |
| geom_abline(slope = 1, intercept = 0) + | |
| coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) + | |
| labs( | |
| subtitle = "Simulated and actual goals scored per player per season", | |
| x = "goals scored (actual)", | |
| y = "goals scored (simulated)" | |
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| players <- | |
| players %>% | |
| left_join( | |
| posterior[, grep("^skill", colnames(posterior), value = TRUE)] %>% | |
| apply(2L, median) %>% | |
| tibble( | |
| player_id = names(.) %>% | |
| gsub(pattern = "[^0-9]", replacement = "") %>% | |
| as.integer(), | |
| model = . | |
| ) %>% | |
| mutate(model = plogis(model)), | |
| by = "player_id" | |
| ) %>% | |
| arrange(desc(model)) | |
| players %>% | |
| ggplot(aes(raw, model, color = mid)) + | |
| geom_point(alpha = 0.1) + | |
| scale_x_continuous(labels = percent) + | |
| scale_y_continuous(labels = percent) + | |
| scale_color_continuous("career midpoint") + | |
| labs( | |
| subtitle = "Raw career shooting percentages and modeled shooting skills", | |
| x = "raw career shooting percentage", | |
| y = "modeled shooting skill" | |
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| players %>% | |
| head(n = 10L) %>% | |
| select(player, position, career, shots, goals, raw, model) %>% | |
| mutate( | |
| raw = sprintf("%.2f%%", raw * 100), | |
| model = sprintf("%.2f%%", model * 100) | |
| ) %>% | |
| kable(format = "html", row.names = TRUE, align = "lllrrrr", | |
| format.args = list(big.mark = ",") | |
| ) | |
| players %>% | |
| filter(position == "defenceman") %>% | |
| head(n = 10L) %>% | |
| select(player, position, career, shots, goals, raw, model) %>% | |
| mutate( | |
| raw = sprintf("%.2f%%", raw * 100), | |
| model = sprintf("%.2f%%", model * 100) | |
| ) %>% | |
| kable(format = "html", row.names = TRUE, align = "lllrrrr", | |
| format.args = list(big.mark = ",") | |
| ) | |
| players %>% | |
| select(player, position, career, shots, goals, raw, model) %>% | |
| datatable(filter = "top") %>% | |
| formatCurrency("shots", currency = "", mark = ",", digits = 0L) %>% | |
| formatPercentage("raw", digits = 2L) %>% | |
| formatPercentage("model", digits = 2L) %>% | |
| saveWidget("sharp-shooter.html") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment