ilarischeinin/00-sharp-shooter.R

## 00-sharp-shooter.R
# Ranking NHL’s best shooters with Bayesian multilevel modeling
# https://ilari.scheinin.fi/ranking-nhls-best-shooters/
# Ilari Scheinin
# firstname.lastname@gmail.com
# Code: MIT Licence
# Figures and tables: CC-BY

# For inclusion in the blog post, the main code is split into snippets in
# separate files named like *-chunk.R.
# This file exists purely for the boilerplate above, and for running the entire
# analysis from the command line:
# Rscript 00-sharp-shooter.R

set.seed(20162017L)

png(
  "sharp-shooter-%i.png",
  width = 210L, height = 148L, units = "mm", res = 150L
)

# List the *-chunk.R files and source them. This would be more readable with
# pipes and purr::walk(), but unlike the main code chunks, this file
# intentionally sticks with base R.
invisible(lapply(list.files(pattern = "^[0-9]*-chunk.R$"), source, echo = TRUE))

dev.off()

save.image("sharp-shooter-session.rda")

# EOF

## 01-chunk.R
# Load the packages used. Install them first if needed.
library(tidyverse)
library(rstan)
library(DT)
library(knitr)
library(scales)

## 02-chunk.R
nhl <- read_csv("player-shots-and-goals-by-season.csv")
# How to obtain the raw data is not covered here. There are a few sites that
# host NHL stats, and the one I used was hockey-reference.com. In addition to
# the columns shown in the blog post, the data file contains an integer column
# player_id, because there are a few instances of two players with the same
# name.

nhl %>%
  head(n = 10L) %>%
  select(player, position, season, shots, goals) %>%
  kable(format = "html", row.names = TRUE, format.args = list(big.mark = ","))

## 03-chunk.R
# These new columns will be used later.
nhl <-
  nhl %>%
  mutate(
    year = season %>% sub(pattern = "^.*-", replacement = "") %>% as.integer(),
    season_id = as.integer(as.factor(season)),
    position_id = as.integer(as.factor(position))
  )

players <-
  nhl %>%
  group_by(player_id) %>%
  summarise(
    player = first(player),
    position_id = first(position_id),
    position = first(position),
    start = min(year) - 1L,
    end = max(year),
    mid = (start + end) / 2,
    career =
      paste0(start, "–", ifelse(end == max(nhl$year), yes = "", no = end)),
    shots = sum(shots),
    goals = sum(goals),
    raw = goals / shots
  )

players %>%
  head(n = 10L) %>%
  select(player, position, career, shots, goals, raw) %>%
  mutate(
    raw = ifelse(is.na(raw), yes = "", no = sprintf("%.2f%%", raw * 100))
  ) %>%
  kable(format = "html", row.names = TRUE, align = "lllrrr",
    format.args = list(big.mark = ",")
  )

## 04-chunk.R
players %>%
  ggplot(aes(raw, color = position)) +
  geom_density() +
  scale_x_continuous(labels = percent) +
  labs(
    subtitle = "Career shooting percentages per player position",
    x = "raw career shooting percentage"
  )

## 05-chunk.R
nhl %>%
  group_by(year) %>%
  summarise(average = sum(goals) / sum(shots)) %>%
  ggplot(aes(year, average)) +
  geom_point() +
  scale_x_continuous(labels = function(x) paste0(x - 1L, "-", x)) +
  scale_y_continuous(labels = percent, limits = c(0, 0.15)) +
  labs(
    subtitle = "Overall shooting percentages per season",
    x = "season",
    y = "overall league shooting percentage"
  )

## 06-model.stan
data {
  int<lower=1> n_observations;
  int<lower=1> n_seasons;
  int<lower=1> n_players;
  int<lower=1> n_positions;
  int<lower=1,upper=n_seasons> season[n_observations];
  int<lower=1,upper=n_players> player[n_observations];
  int<lower=1,upper=n_positions> position[n_players];
  int<lower=0> shots[n_observations];
  int<lower=0> goals[n_observations];
}
parameters {
  real league_mu;
  real<lower=0> league_sigma;
  vector[n_positions] position_mu;
  vector<lower=0>[n_positions] position_sigma;
  vector[n_players] skill;
  vector[n_seasons-1] difficulty_raw;
}
transformed parameters {
  vector[n_seasons] difficulty; // so that they sum up to zero
  difficulty[1:(n_seasons-1)] = difficulty_raw;
  difficulty[n_seasons] = -sum(difficulty_raw);
}
model {
  league_mu ~ normal(-2, 1); // 95% probability to be between 2% and 50%
  league_sigma ~ cauchy(0, 1);
  position_mu ~ normal(league_mu, league_sigma);
  position_sigma ~ cauchy(0, 1);
  skill ~ normal(position_mu[position], position_sigma[position]);
  difficulty_raw ~ normal(0, 1);
  goals ~ binomial_logit(shots, skill[player] - difficulty[season]);
}

## 07-chunk.R
fit <-
  stan("06-model.stan",
    data = list(
      n_observations = nrow(nhl),
      n_seasons = max(nhl$season_id),
      n_players = nrow(players),
      n_positions = max(nhl$position_id),
      season = nhl$season_id,
      player = nhl$player_id,
      position = players$position_id,
      shots = nhl$shots,
      goals = nhl$goals
    ),
    seed = 20162017L
  )

# Check diagnostic plots to make sure that chains converged.
# shinystan::launch_shinystan(fit)

## 08-chunk.R
posterior <- as.matrix(fit)

nhl <-
  nhl %>%
  mutate(
    simulation = pmap_int(
      list(shots, player_id, season_id),
      ~rbinom(
        1L,
        size = ..1,
        prob = plogis(
          posterior[, paste0("skill[", ..2, "]")] -
            posterior[, paste0("difficulty[", ..3, "]")]
        )
      )
    )
  )

nhl %>%
  group_by(player_id) %>%
  summarise(
    position = first(position),
    actual = sum(goals) / sum(shots),
    simulated = sum(simulation) / sum(shots)
  ) %>%
  gather(key, value, actual, simulated) %>%
  ggplot(aes(value, linetype = key, color = position)) +
  geom_density() +
  scale_x_continuous(labels = percent) +
  scale_linetype_discrete(name = "goals") +
  labs(
    subtitle = paste("Simulated and actual career shooting percentages per",
      "player position"
    ),
    x = "raw career shooting percentage"
  )

## 09-chunk.R
nhl %>%
  group_by(year) %>%
  summarise(
    actual = sum(goals) / sum(shots),
    simulated = sum(simulation) / sum(shots)
  ) %>%
  gather(key, value, -year) %>%
  ggplot(aes(year, value, color = key)) +
  geom_point() +
  scale_x_continuous(labels = function(x) paste0(x - 1L, "-", x)) +
  scale_y_continuous(labels = percent, limits = c(0, 0.15)) +
  scale_color_discrete(name = "goals") +
  labs(
    subtitle = "Simulated and actual overall shooting percentages per season",
    x = "season",
    y = "overall league shooting percentage"
  )

## 10-chunk.R
ggplot(nhl, aes(goals, simulation)) +
  geom_jitter(alpha = 0.1) +
  geom_abline(slope = 1, intercept = 0) +
  coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
  labs(
    subtitle = "Simulated and actual goals scored per player per season",
    x = "goals scored (actual)",
    y = "goals scored (simulated)"
  )

## 11-chunk.R
players <-
  players %>%
  left_join(
    posterior[, grep("^skill", colnames(posterior), value = TRUE)] %>%
      apply(2L, median) %>%
      tibble(
        player_id = names(.) %>%
          gsub(pattern = "[^0-9]", replacement = "") %>%
          as.integer(),
        model = .
      ) %>%
      mutate(model = plogis(model)),
    by = "player_id"
  ) %>%
  arrange(desc(model))

players %>%
  ggplot(aes(raw, model, color = mid)) +
  geom_point(alpha = 0.1) +
  scale_x_continuous(labels = percent) +
  scale_y_continuous(labels = percent) +
  scale_color_continuous("career midpoint") +
  labs(
    subtitle = "Raw career shooting percentages and modeled shooting skills",
    x = "raw career shooting percentage",
    y = "modeled shooting skill"
  )

## 12-chunk.R
players %>%
  head(n = 10L) %>%
  select(player, position, career, shots, goals, raw, model) %>%
  mutate(
    raw = sprintf("%.2f%%", raw * 100),
    model = sprintf("%.2f%%", model * 100)
  ) %>%
  kable(format = "html", row.names = TRUE, align = "lllrrrr",
    format.args = list(big.mark = ",")
  )

players %>%
  filter(position == "defenceman") %>%
  head(n = 10L) %>%
  select(player, position, career, shots, goals, raw, model) %>%
  mutate(
    raw = sprintf("%.2f%%", raw * 100),
    model = sprintf("%.2f%%", model * 100)
  ) %>%
  kable(format = "html", row.names = TRUE, align = "lllrrrr",
    format.args = list(big.mark = ",")
  )

players %>%
  select(player, position, career, shots, goals, raw, model) %>%
  datatable(filter = "top") %>%
  formatCurrency("shots", currency = "", mark = ",", digits = 0L) %>%
  formatPercentage("raw", digits = 2L) %>%
  formatPercentage("model", digits = 2L) %>%
  saveWidget("sharp-shooter.html")
	# Ranking NHL’s best shooters with Bayesian multilevel modeling
	# https://ilari.scheinin.fi/ranking-nhls-best-shooters/
	# Ilari Scheinin
	# firstname.lastname@gmail.com
	# Code: MIT Licence
	# Figures and tables: CC-BY

	# For inclusion in the blog post, the main code is split into snippets in
	# separate files named like *-chunk.R.
	# This file exists purely for the boilerplate above, and for running the entire
	# analysis from the command line:
	# Rscript 00-sharp-shooter.R

	set.seed(20162017L)

	png(
	"sharp-shooter-%i.png",
	width = 210L, height = 148L, units = "mm", res = 150L
	)

	# List the *-chunk.R files and source them. This would be more readable with
	# pipes and purr::walk(), but unlike the main code chunks, this file
	# intentionally sticks with base R.
	invisible(lapply(list.files(pattern = "^[0-9]*-chunk.R$"), source, echo = TRUE))

	dev.off()

	save.image("sharp-shooter-session.rda")

	# EOF
	# Load the packages used. Install them first if needed.
	library(tidyverse)
	library(rstan)
	library(DT)
	library(knitr)
	library(scales)
	nhl <- read_csv("player-shots-and-goals-by-season.csv")
	# How to obtain the raw data is not covered here. There are a few sites that
	# host NHL stats, and the one I used was hockey-reference.com. In addition to
	# the columns shown in the blog post, the data file contains an integer column
	# player_id, because there are a few instances of two players with the same
	# name.

	nhl %>%
	head(n = 10L) %>%
	select(player, position, season, shots, goals) %>%
	kable(format = "html", row.names = TRUE, format.args = list(big.mark = ","))
	# These new columns will be used later.
	nhl <-
	nhl %>%
	mutate(
	year = season %>% sub(pattern = "^.*-", replacement = "") %>% as.integer(),
	season_id = as.integer(as.factor(season)),
	position_id = as.integer(as.factor(position))
	)

	players <-
	nhl %>%
	group_by(player_id) %>%
	summarise(
	player = first(player),
	position_id = first(position_id),
	position = first(position),
	start = min(year) - 1L,
	end = max(year),
	mid = (start + end) / 2,
	career =
	paste0(start, "–", ifelse(end == max(nhl$year), yes = "", no = end)),
	shots = sum(shots),
	goals = sum(goals),
	raw = goals / shots
	)

	players %>%
	head(n = 10L) %>%
	select(player, position, career, shots, goals, raw) %>%
	mutate(
	raw = ifelse(is.na(raw), yes = "", no = sprintf("%.2f%%", raw * 100))
	) %>%
	kable(format = "html", row.names = TRUE, align = "lllrrr",
	format.args = list(big.mark = ",")
	)
	players %>%
	ggplot(aes(raw, color = position)) +
	geom_density() +
	scale_x_continuous(labels = percent) +
	labs(
	subtitle = "Career shooting percentages per player position",
	x = "raw career shooting percentage"
	)
	nhl %>%
	group_by(year) %>%
	summarise(average = sum(goals) / sum(shots)) %>%
	ggplot(aes(year, average)) +
	geom_point() +
	scale_x_continuous(labels = function(x) paste0(x - 1L, "-", x)) +
	scale_y_continuous(labels = percent, limits = c(0, 0.15)) +
	labs(
	subtitle = "Overall shooting percentages per season",
	x = "season",
	y = "overall league shooting percentage"
	)
	data {
	int<lower=1> n_observations;
	int<lower=1> n_seasons;
	int<lower=1> n_players;
	int<lower=1> n_positions;
	int<lower=1,upper=n_seasons> season[n_observations];
	int<lower=1,upper=n_players> player[n_observations];
	int<lower=1,upper=n_positions> position[n_players];
	int<lower=0> shots[n_observations];
	int<lower=0> goals[n_observations];
	}
	parameters {
	real league_mu;
	real<lower=0> league_sigma;
	vector[n_positions] position_mu;
	vector<lower=0>[n_positions] position_sigma;
	vector[n_players] skill;
	vector[n_seasons-1] difficulty_raw;
	}
	transformed parameters {
	vector[n_seasons] difficulty; // so that they sum up to zero
	difficulty[1:(n_seasons-1)] = difficulty_raw;
	difficulty[n_seasons] = -sum(difficulty_raw);
	}
	model {
	league_mu ~ normal(-2, 1); // 95% probability to be between 2% and 50%
	league_sigma ~ cauchy(0, 1);
	position_mu ~ normal(league_mu, league_sigma);
	position_sigma ~ cauchy(0, 1);
	skill ~ normal(position_mu[position], position_sigma[position]);
	difficulty_raw ~ normal(0, 1);
	goals ~ binomial_logit(shots, skill[player] - difficulty[season]);
	}
	fit <-
	stan("06-model.stan",
	data = list(
	n_observations = nrow(nhl),
	n_seasons = max(nhl$season_id),
	n_players = nrow(players),
	n_positions = max(nhl$position_id),
	season = nhl$season_id,
	player = nhl$player_id,
	position = players$position_id,
	shots = nhl$shots,
	goals = nhl$goals
	),
	seed = 20162017L
	)

	# Check diagnostic plots to make sure that chains converged.
	# shinystan::launch_shinystan(fit)
	posterior <- as.matrix(fit)

	nhl <-
	nhl %>%
	mutate(
	simulation = pmap_int(
	list(shots, player_id, season_id),
	~rbinom(
	1L,
	size = ..1,
	prob = plogis(
	posterior[, paste0("skill[", ..2, "]")] -
	posterior[, paste0("difficulty[", ..3, "]")]
	)
	)
	)
	)

	nhl %>%
	group_by(player_id) %>%
	summarise(
	position = first(position),
	actual = sum(goals) / sum(shots),
	simulated = sum(simulation) / sum(shots)
	) %>%
	gather(key, value, actual, simulated) %>%
	ggplot(aes(value, linetype = key, color = position)) +
	geom_density() +
	scale_x_continuous(labels = percent) +
	scale_linetype_discrete(name = "goals") +
	labs(
	subtitle = paste("Simulated and actual career shooting percentages per",
	"player position"
	),
	x = "raw career shooting percentage"
	)
	ggplot(nhl, aes(goals, simulation)) +
	geom_jitter(alpha = 0.1) +
	geom_abline(slope = 1, intercept = 0) +
	coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
	labs(
	subtitle = "Simulated and actual goals scored per player per season",
	x = "goals scored (actual)",
	y = "goals scored (simulated)"
	)