Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
library(tidyverse)
library(xgboost)
library(magrittr)
library(dplyr)
library(Matrix)
library(na.tools)
library(ggimage)
library(nflfastR)
library(gt)
library(mgcv)
library(scales)
library(ggforce)
library(remotes)
library(ggtext)
source("https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_nflscrapr_mutations.R")
source("https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_ep_wp.R")
source("https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_cp_cpoe.R")
seasons <- 2010:2020
pbp <- purrr::map_df(seasons, function(x) {
readRDS(
url(
glue::glue("https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_{x}.rds")
)
)
})
pbp_rp <- pbp %>%
filter(!is_na(epa), play_type=="no_play" | play_type=="pass" | play_type=="run")
pbp_rp <- pbp_rp %>%
mutate(
pass = if_else(str_detect(desc, "( pass)|(sacked)|(scramble)"), 1, 0),
rush = if_else(str_detect(desc, "(left end)|(left tackle)|(left guard)|(up the middle)|(right guard)|(right tackle)|(right end)") & pass == 0, 1, 0),
success = ifelse(epa>0, 1 , 0)
)
pbp_rp <- pbp_rp %>% filter(pass==1 | rush==1)
pbp_rp <- pbp_rp %>%
mutate(season = substr(old_game_id, 1, 4))
pbp_rp <- pbp_rp %>%
mutate(
posteam = case_when(
posteam == 'OAK' ~ 'LV',
posteam == 'SD' ~ 'LAC',
posteam == 'STL' ~ 'LA',
TRUE ~ posteam
)
)
pbp_rp <- pbp_rp %>%
mutate(
defteam = case_when(
defteam == 'OAK' ~ 'LV',
defteam == 'SD' ~ 'LAC',
defteam == 'STL' ~ 'LA',
TRUE ~ defteam
)
)
rush_attempts <- pbp_rp %>%
filter(rush_attempt == 1, qb_scramble == 0, qb_dropback == 0)
rush_attempts %>%
group_by(season, defteam) %>%
summarize(def_ypc = mean(yards_gained),
count = n()) %>%
filter(count >= 100) %>%
select(-count) -> def_ypc
rush_attempts <- rush_attempts %>%
left_join(def_ypc, by = c("season", "defteam"))
rush_attempts2 <- rush_attempts %>%
mutate(yards_rushed = case_when(yards_gained > 20 ~ 20L,
yards_gained < -5 ~ -5L,
TRUE ~ as.integer(yards_gained)),
label = yards_rushed + 5L)
rush_attempts3 <- rush_attempts2 %>%
mutate(run_left_end = if_else((run_gap == "end" & run_location == "left"), 1, 0),
run_left_guard = if_else((run_gap == "guard" & run_location == "left"), 1, 0),
run_left_tackle = if_else((run_gap == "tackle" & run_location == "left"), 1, 0),
run_right_end = if_else((run_gap == "end" & run_location == "right"), 1, 0),
run_right_guard = if_else((run_gap == "guard" & run_location == "right"), 1, 0),
run_right_tackle = if_else((run_gap == "tackle" & run_location == "right"), 1, 0),
run_middle = if_else((run_location == "middle"), 1, 0))
rush_attempts4 <- rush_attempts3 %>%
select(yardline_100, quarter_seconds_remaining, half_seconds_remaining,
game_seconds_remaining, qtr, down, goal_to_go, ydstogo, shotgun, no_huddle,
no_score_prob, ep, wp, def_ypc, label) %>%
filter(!is.na(label)) %>%
filter(!is.na(down))
nrounds <- 100
params <-
list(
booster = "gbtree",
objective = "multi:softprob",
eval_metric = c("mlogloss"),
num_class = 26,
eta = .025,
gamma = 2,
subsample=0.8,
colsample_bytree=0.8,
max_depth = 4,
min_child_weight = 1
)
smp_size <- floor(0.80 * nrow(rush_attempts4))
set.seed(123)
ind <- sample(seq_len(nrow(rush_attempts4)), size = smp_size)
ind_train <- rush_attempts4[ind, ]
ind_test <- rush_attempts4[-ind, ]
full_train <- xgboost::xgb.DMatrix(as.matrix(ind_train %>% select(-label)), label = as.integer(ind_train$label))
ryoe_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2)
imp <- xgb.importance(colnames(ind_train), model = ryoe_model)
xgb.plot.importance(imp)
rushes_2020 <- rush_attempts3 %>%
filter(season == 2020) %>%
select(yardline_100, quarter_seconds_remaining, half_seconds_remaining,
game_seconds_remaining, qtr, down, goal_to_go, ydstogo, shotgun, no_huddle,
no_score_prob, ep, wp, def_ypc) %>%
mutate(index = 1:n())
ryoe_2020 <- stats::predict(ryoe_model,
as.matrix(rushes_2020 %>%
select(yardline_100, quarter_seconds_remaining, half_seconds_remaining,
game_seconds_remaining, qtr, down, goal_to_go, ydstogo, shotgun, no_huddle,
no_score_prob, ep, wp, def_ypc))) %>%
tibble::as_tibble() %>%
dplyr::rename(prob = "value") %>%
dplyr::bind_cols(purrr::map_dfr(seq_along(rushes_2020$index), function(x) {
tibble::tibble("xyds_rushed" = -5:20,
"down" = rushes_2020$down[[x]],
"yardline_100" = rushes_2020$yardline_100[[x]],
"quarter_seconds_remaining" = rushes_2020$quarter_seconds_remaining[[x]],
"half_seconds_remaining" = rushes_2020$half_seconds_remaining[[x]],
"game_seconds_remaining" = rushes_2020$game_seconds_remaining[[x]],
"qtr" = rushes_2020$qtr[[x]],
"goal_to_go" = rushes_2020$goal_to_go[[x]],
"ydstogo" = rushes_2020$ydstogo[[x]],
"shotgun" = rushes_2020$shotgun[[x]],
"no_huddle" = rushes_2020$no_huddle[[x]],
"no_score_prob" = rushes_2020$no_score_prob[[x]],
"ep" = rushes_2020$ep[[x]],
"wp" = rushes_2020$wp[[x]],
"index" = rushes_2020$index[[x]])
})) %>%
dplyr::group_by(.data$index) %>%
dplyr::mutate(max_loss = dplyr::if_else(.data$yardline_100 < 95, -5L, as.integer(.data$yardline_100 - 99L)),
max_gain = dplyr::if_else(.data$yardline_100 > 20, 20L, as.integer(.data$yardline_100)),
cum_prob = cumsum(.data$prob),
prob = dplyr::case_when(.data$xyds_rushed == .data$max_loss ~ .data$prob,
.data$xyds_rushed == .data$max_gain ~ 1 - dplyr::lag(.data$cum_prob, 1),
TRUE ~ .data$prob),
yardline_100 = .data$yardline_100 - .data$xyds_rushed) %>%
dplyr::filter(.data$xyds_rushed >= .data$max_loss, .data$xyds_rushed <= .data$max_gain) %>%
dplyr::select(-.data$cum_prob) %>%
dplyr::summarise(x_rush_yards = sum(.data$prob * .data$xyds_rushed)) %>%
ungroup()
rushes_2020_2 <- rushes_2020 %>%
inner_join(ryoe_2020)
pbp_2020 <- pbp_rp %>%
inner_join(rushes_2020_2) %>%
select(posteam, defteam, rusher_player_name, yards_gained, x_rush_yards, epa) %>%
mutate(ryoe = yards_gained - x_rush_yards)
rushers_2020 <- pbp_2020 %>%
filter(!is.na(rusher_player_name)) %>%
group_by(rusher_player_name, posteam) %>%
summarize(rushes = n(),
sum_ryoe = sum(ryoe, na.rm = T),
avg_ryoe = mean(ryoe, na.rm =T),
mean_epa = mean(epa, na.rm = T)) %>%
filter(rushes > 107) %>%
arrange(desc(avg_ryoe))
############################################################################
rushers_2020 <- rushers_2020 %>%
left_join(teams_colors_logos, by = c('posteam' = 'team_abbr'))
scatter_plot <- rushers_2020 %>%
ggplot() +
geom_smooth(aes(x = mean_epa, y = avg_ryoe), method = "lm", color = "grey") +
ggrepel::geom_text_repel(
aes(x = mean_epa, y = avg_ryoe, label = rusher_player_name),
box.padding = 0.3, size = 5
) +
geom_point(
aes(x = mean_epa, y = avg_ryoe, size = rushes, fill = team_color, color = team_color2),
shape = 21
) +
scale_color_identity(aesthetics = c("fill", "color")) +
scale_size(name = "Designed Rushes") +
theme_minimal() +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
labs(x = "EPA Per Rush",
y = "Rushing Yards Over Expected Per Rush",
title = "RYOE and EPA Are Correlated",
subtitle = "RYOE is a xgboost model, min. of 105 designed rushes",
caption = "By Tej Seth | @mfbanalytics | @deceptivespeed_") +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5),
axis.text = element_text(size = 14),
axis.title.y = element_text(size = 14)
)
scatter_plot
ggsave(
"ryoe-1.png", scatter_plot,
height = 10, width = 16, dpi = "retina"
)
rusher_faces<- pbp_2020 %>%
filter(!is.na(rusher_player_name)) %>%
group_by(rusher_player_name, posteam) %>%
summarize(rushes = n(),
sum_ryoe = sum(ryoe, na.rm = T),
avg_ryoe = mean(ryoe, na.rm =T),
mean_epa = mean(epa, na.rm = T)) %>%
filter(rushes > 75) %>%
arrange(desc(avg_ryoe))
write.csv(rusher_faces, "rusher_faces.csv")
rusher_faces <- read.csv("~/RYOE/rusher_faces.csv")
tab_data <- rusher_faces %>%
mutate(RK = as.integer(rank)) %>%
select(RK, rusher, headshot, mean_epa, avg_ryoe) %>%
arrange(RK)
tab_function <- function(data, ...){
data %>%
gt() %>%
text_transform(
locations = cells_body(vars(headshot)),
fn = function(x){
web_image(
url = x,
height = px(25)
)
}
) %>%
cols_label(
RK = "Rank",
rusher = "Rusher",
headshot = "",
mean_epa = "EPA/Rush",
avg_ryoe = "RYOE") %>%
data_color(
columns = vars(avg_ryoe),
colors = scales::col_numeric(
palette = c("#af8dc3", "#f7f7f7", "#7fbf7b"),
domain = c(-3, 2)
)
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(
columns = vars(RK, rusher)
)
) %>%
tab_header(
title = "Top 30 RYOE Rushers",
subtitle = "RYOE = Rushing Yards Over Expected"
) %>%
tab_options(
column_labels.background.color = "white",
column_labels.font.weight = "bold",
table.border.top.width = px(3),
table.border.top.color = "transparent",
table.border.bottom.color = "transparent",
table.border.bottom.width = px(3),
column_labels.border.top.width = px(3),
column_labels.border.top.color = "transparent",
column_labels.border.bottom.width = px(3),
column_labels.border.bottom.color = "black",
data_row.padding = px(3),
source_notes.font.size = 12,
table.font.size = 16,
heading.align = "middle",
...
) %>%
opt_table_font(
font = list(
default_fonts()
)
)
}
gt_tab1 <- tab_data %>%
filter(RK < 31) %>%
tab_function()
gt_tab1
gtsave(gt_tab1, "gt-tab1.png")
gt_tab2 <- tab_data %>%
filter(RK >= 31) %>%
tab_function() %>%
tab_header(
title = "Rushers 31-60",
subtitle = "RYOE is a xgboost model made by Tej Seth (@mfbanalytics)"
) %>%
tab_style(
style = cell_borders(
sides = "left",
color = "black",
weight = px(3)
),
locations =
list(
cells_body(
columns = 1
),
cells_column_labels(1)
)
)
gt_tab2
gtsave(gt_tab2, "gt-tab2.png")
img1 <- magick::image_read("gt-tab1.png")
img2 <- magick::image_read("gt-tab2.png")
img3 <- magick::image_append(c(img1, img2))
img3
ggsave(img3, "ryoe-2.png")
rusher_faces2 <- read.csv("~/RYOE/rusher_faces.csv")
rusher_faces2 <- rusher_faces2 %>%
left_join(teams_colors_logos, by = c('posteam' = 'team_abbr'))
rusher_faces2 <- rusher_faces2 %>%
arrange(rank)
teams_xp <- pbp_2020 %>%
group_by(posteam) %>%
summarize(team_avg = mean(x_rush_yards, na.rm = T))
rusher_faces2 <- rusher_faces2 %>%
left_join(teams_xp)
rusher_faces2 <- rusher_faces2 %>%
mutate(yards_gained = avg_ryoe + team_avg,
new_rank = row_number())
rusher_faces2 %>%
ggplot() +
geom_link(
mapping = aes(x = team_avg, y = new_rank, xend = yards_gained, yend = new_rank, size = 2, color = team_color)
) +
theme_bw() +
scale_colour_identity() +
geom_image(aes(x = team_avg, y = new_rank, image = team_logo_espn), size = 0.04, asp = 16/9) +
geom_image(aes(x = yards_gained, y = new_rank, image = headshot), size = 0.04, asp = 16/9) +
labs(
x = "Rushing Yards Average",
y = "",
title = "Each Team's Most Used Rusher's RYOE",
subtitle = "RYOE = Rushing Yards Over Expected, if a player's face is to the right of their logo they have a positive RYOE",
caption = "By Tej Seth | @mfbanalytics | @deceptivespeed_"
) +
theme(
plot.title = element_markdown(hjust = 0.5, size = 20, face = "bold"),
plot.subtitle = element_markdown(hjust = 0.5, size = 12),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
legend.position = "none",
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.background = element_blank(),
panel.border= element_blank()
) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
scale_y_reverse(breaks = scales::pretty_breaks(n = 10))
ggsave('ryoe-3.png', dpi=300, height=9*.8, width=16*.8)
top_rushers <- pbp_2020 %>%
filter(!is.na(rusher_player_name)) %>%
group_by(rusher_player_name) %>%
summarize(rushes = n(),
sum_ryoe = sum(ryoe, na.rm = T)) %>%
filter(rushes > 107) %>%
arrange(desc(sum_ryoe)) %>%
filter(sum_ryoe > 0)
top_rushers <- top_rushers %>%
left_join(rusher_faces)
top_rushers <- top_rushers %>%
left_join(teams_colors_logos, by = c("posteam" = "team_abbr")) %>%
filter(!is.na(rank)) %>%
mutate(rank = row_number())
link_to_img <- function(x, width = 50) {
glue::glue("<img src='{x}' width='{width}'/>")
}
bar_plot <- top_rushers %>%
mutate(label = link_to_img(headshot),
rank = as.integer(rank)) %>%
ggplot() +
geom_col(
aes(
x = rank, y = sum_ryoe,
fill = team_color, color = team_color2
),
width = 0.4
) +
geom_image(aes(x = rank, y = sum_ryoe + 5 , image = headshot), asp = 16/9, size = 0.06) +
scale_color_identity(aesthetics = c("fill", "color")) +
theme_minimal() +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
labs(x = NULL,
y = "Total RYOE\n",
title = "The Running Backs 10 in Rushing Yards Over Expected",
caption = "By Tej Seth | @mfbanalytics") +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5),
axis.text = element_text(size = 14, face = "bold"),
axis.title.y = element_text(size = 16, face = "bold")
)
bar_plot
ggsave(
"ryoe-4.png", bar_plot,
height = 10, width = 16, dpi = "retina"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment