Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Ranking NHL’s best shooters with Bayesian multilevel modeling - https://ilari.scheinin.fi/ranking-nhls-best-shooters/
# Ranking NHL’s best shooters with Bayesian multilevel modeling
# https://ilari.scheinin.fi/ranking-nhls-best-shooters/
# Ilari Scheinin
# firstname.lastname@gmail.com
# Code: MIT Licence
# Figures and tables: CC-BY
set.seed(20162017L)
png("sharp-shooter-%i.png", width=210L, height=148L, units="mm", res=150L)
invisible(lapply(list.files(pattern="^[0-9]*-chunk.R"), source, echo=TRUE))
dev.off()
save.image("sharp-shooter-session.rda")
# EOF
# Load the packages used. Install them first if needed.
library(tidyverse)
library(rstan)
library(DT)
library(knitr)
library(scales)
nhl <- read_csv("player-shots-and-goals-by-season.csv")
# How to obtain the raw data is not covered here. There are a few sites that
# host NHL stats, and the one I used was hockey-reference.com. In addition to
# the columns shown in the blog post, the data file contains an integer column
# player_id, because there are a few instances of two players with the same
# name.
nhl %>%
head(n=10L) %>%
select(player, position, season, shots, goals) %>%
kable(format="html", row.names=TRUE, format.args=list(big.mark=","))
# These new columns will be used later.
nhl <- nhl %>%
mutate(
year=season %>% sub(pattern="^.*-", replacement="") %>% as.integer(),
season_id=as.integer(as.factor(season)),
position_id=as.integer(as.factor(position))
)
players <- nhl %>%
group_by(player_id) %>%
summarise(
player=first(player),
position_id=first(position_id),
position=first(position),
start=min(year) - 1L,
end=max(year),
mid=(start + end) / 2,
career=paste0(start, "", ifelse(end == max(nhl$year), yes="", no=end)),
shots=sum(shots),
goals=sum(goals),
raw=goals / shots
)
players %>%
head(n=10L) %>%
select(player, position, career, shots, goals, raw) %>%
mutate(
raw=ifelse(is.na(raw), yes="", no=sprintf("%.2f%%", raw * 100))
) %>%
kable(format="html", row.names=TRUE, align="lllrrr",
format.args=list(big.mark=",")
)
players %>%
ggplot(aes(raw, color=position)) +
geom_density() +
scale_x_continuous(labels=percent) +
labs(
subtitle="Career shooting percentages per player position",
x="raw career shooting percentage"
)
nhl %>%
group_by(year) %>%
summarise(average=sum(goals) / sum(shots)) %>%
ggplot(aes(year, average)) +
geom_point() +
scale_x_continuous(labels=function(x) paste0(x - 1L, "-", x)) +
scale_y_continuous(labels=percent, limits=c(0, 0.15)) +
labs(
subtitle="Overall shooting percentages per season",
x="season",
y="overall league shooting percentage"
)
data {
int<lower=1> n_observations;
int<lower=1> n_seasons;
int<lower=1> n_players;
int<lower=1> n_positions;
int<lower=1,upper=n_seasons> season[n_observations];
int<lower=1,upper=n_players> player[n_observations];
int<lower=1,upper=n_positions> position[n_players];
int<lower=0> shots[n_observations];
int<lower=0> goals[n_observations];
}
parameters {
real league_mu;
real<lower=0> league_sigma;
vector[n_positions] position_mu;
vector<lower=0>[n_positions] position_sigma;
vector[n_players] skill;
vector[n_seasons-1] difficulty_raw;
}
transformed parameters {
vector[n_seasons] difficulty; // so that they sum up to zero
difficulty[1:(n_seasons-1)] = difficulty_raw;
difficulty[n_seasons] = -sum(difficulty_raw);
}
model {
league_mu ~ normal(-2, 1); // 95% probability to be between 2% and 50%
league_sigma ~ cauchy(0, 1);
position_mu ~ normal(league_mu, league_sigma);
position_sigma ~ cauchy(0, 1);
skill ~ normal(position_mu[position], position_sigma[position]);
difficulty_raw ~ normal(0, 1);
goals ~ binomial_logit(shots, skill[player] - difficulty[season]);
}
fit <- stan("06-model.stan",
data=list(
n_observations=nrow(nhl),
n_seasons=max(nhl$season_id),
n_players=nrow(players),
n_positions=max(nhl$position_id),
season=nhl$season_id,
player=nhl$player_id,
position=players$position_id,
shots=nhl$shots,
goals=nhl$goals
),
seed=20162017L
)
# Check diagnostic plots to make sure that chains converged.
# shinystan::launch_shinystan(fit)
posterior <- as.matrix(fit)
nhl$simulation <-
sapply(seq_len(nrow(nhl)), function(i) rbinom(n=1L, size=nhl$shots[i],
prob=plogis(
posterior[, paste0("skill[", nhl$player_id[i], "]")] -
posterior[, paste0("difficulty[", nhl$season_id[i], "]")]
)
))
nhl %>%
group_by(player_id) %>%
summarise(
position=first(position),
actual=sum(goals) / sum(shots),
simulated=sum(simulation) / sum(shots)
) %>%
gather(key, value, actual, simulated) %>%
ggplot(aes(value, linetype=key, color=position)) +
geom_density() +
scale_x_continuous(labels=percent) +
scale_linetype_discrete(name="goals") +
labs(
subtitle=paste("Simulated and actual career shooting percentages per",
"player position"
),
x="raw career shooting percentage"
)
nhl %>%
group_by(year) %>%
summarise(
actual=sum(goals) / sum(shots),
simulated=sum(simulation) / sum(shots)
) %>%
gather(key, value, -year) %>%
ggplot(aes(year, value, color=key)) +
geom_point() +
scale_x_continuous(labels=function(x) paste0(x - 1L, "-", x)) +
scale_y_continuous(labels=percent, limits=c(0, 0.15)) +
scale_color_discrete(name="goals") +
labs(
subtitle="Simulated and actual overall shooting percentages per season",
x="season",
y="overall league shooting percentage"
)
ggplot(nhl, aes(goals, simulation)) +
geom_jitter(alpha=0.1) +
geom_abline(slope=1, intercept=0) +
coord_cartesian(xlim=c(0, 100), ylim=c(0, 100)) +
labs(
subtitle="Simulated and actual goals scored per player per season",
x="goals scored (actual)",
y="goals scored (simulated)"
)
players <- players %>%
left_join(
posterior[, grep("^skill", colnames(posterior), value=TRUE)] %>%
apply(2L, median) %>%
data_frame(
player_id=names(.) %>% gsub(pattern="[^0-9]", replacement="") %>%
as.integer(),
model=.
) %>%
mutate(model=plogis(model))
) %>%
arrange(desc(model))
players %>%
ggplot(aes(raw, model, color=mid)) +
geom_point(alpha=0.1) +
scale_x_continuous(labels=percent) +
scale_y_continuous(labels=percent) +
scale_color_continuous("career midpoint") +
labs(
subtitle="Raw career shooting percentages and modeled shooting skills",
x="raw career shooting percentage",
y="modeled shooting skill"
)
players %>%
head(n=10L) %>%
select(player, position, career, shots, goals, raw, model) %>%
mutate(
raw=sprintf("%.2f%%", raw * 100),
model=sprintf("%.2f%%", model * 100)
) %>%
kable(format="html", row.names=TRUE, align="lllrrrr",
format.args=list(big.mark=",")
)
players %>%
filter(position == "defenceman") %>%
head(n=10L) %>%
select(player, position, career, shots, goals, raw, model) %>%
mutate(
raw=sprintf("%.2f%%", raw * 100),
model=sprintf("%.2f%%", model * 100)
) %>%
kable(format="html", row.names=TRUE, align="lllrrrr",
format.args=list(big.mark=",")
)
players %>%
select(player, position, career, shots, goals, raw, model) %>%
datatable(filter="top") %>%
formatCurrency("shots", currency="", mark=",", digits=0L) %>%
formatPercentage("raw", digits=2L) %>%
formatPercentage("model", digits=2L) %>%
saveWidget("sharp-shooter.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment