This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(xgboost) | |
library(magrittr) | |
library(dplyr) | |
library(Matrix) | |
library(na.tools) | |
library(ggimage) | |
library(nflfastR) | |
library(gt) | |
library(mgcv) | |
library(scales) | |
library(ggforce) | |
library(remotes) | |
library(ggtext) | |
source("https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_nflscrapr_mutations.R") | |
source("https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_ep_wp.R") | |
source("https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_cp_cpoe.R") | |
seasons <- 2010:2020 | |
pbp <- purrr::map_df(seasons, function(x) { | |
readRDS( | |
url( | |
glue::glue("https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_{x}.rds") | |
) | |
) | |
}) | |
pbp_rp <- pbp %>% | |
filter(!is_na(epa), play_type=="no_play" | play_type=="pass" | play_type=="run") | |
pbp_rp <- pbp_rp %>% | |
mutate( | |
pass = if_else(str_detect(desc, "( pass)|(sacked)|(scramble)"), 1, 0), | |
rush = if_else(str_detect(desc, "(left end)|(left tackle)|(left guard)|(up the middle)|(right guard)|(right tackle)|(right end)") & pass == 0, 1, 0), | |
success = ifelse(epa>0, 1 , 0) | |
) | |
pbp_rp <- pbp_rp %>% filter(pass==1 | rush==1) | |
pbp_rp <- pbp_rp %>% | |
mutate(season = substr(old_game_id, 1, 4)) | |
pbp_rp <- pbp_rp %>% | |
mutate( | |
posteam = case_when( | |
posteam == 'OAK' ~ 'LV', | |
posteam == 'SD' ~ 'LAC', | |
posteam == 'STL' ~ 'LA', | |
TRUE ~ posteam | |
) | |
) | |
pbp_rp <- pbp_rp %>% | |
mutate( | |
defteam = case_when( | |
defteam == 'OAK' ~ 'LV', | |
defteam == 'SD' ~ 'LAC', | |
defteam == 'STL' ~ 'LA', | |
TRUE ~ defteam | |
) | |
) | |
rush_attempts <- pbp_rp %>% | |
filter(rush_attempt == 1, qb_scramble == 0, qb_dropback == 0) | |
rush_attempts %>% | |
group_by(season, defteam) %>% | |
summarize(def_ypc = mean(yards_gained), | |
count = n()) %>% | |
filter(count >= 100) %>% | |
select(-count) -> def_ypc | |
rush_attempts <- rush_attempts %>% | |
left_join(def_ypc, by = c("season", "defteam")) | |
rush_attempts2 <- rush_attempts %>% | |
mutate(yards_rushed = case_when(yards_gained > 20 ~ 20L, | |
yards_gained < -5 ~ -5L, | |
TRUE ~ as.integer(yards_gained)), | |
label = yards_rushed + 5L) | |
rush_attempts3 <- rush_attempts2 %>% | |
mutate(run_left_end = if_else((run_gap == "end" & run_location == "left"), 1, 0), | |
run_left_guard = if_else((run_gap == "guard" & run_location == "left"), 1, 0), | |
run_left_tackle = if_else((run_gap == "tackle" & run_location == "left"), 1, 0), | |
run_right_end = if_else((run_gap == "end" & run_location == "right"), 1, 0), | |
run_right_guard = if_else((run_gap == "guard" & run_location == "right"), 1, 0), | |
run_right_tackle = if_else((run_gap == "tackle" & run_location == "right"), 1, 0), | |
run_middle = if_else((run_location == "middle"), 1, 0)) | |
rush_attempts4 <- rush_attempts3 %>% | |
select(yardline_100, quarter_seconds_remaining, half_seconds_remaining, | |
game_seconds_remaining, qtr, down, goal_to_go, ydstogo, shotgun, no_huddle, | |
no_score_prob, ep, wp, def_ypc, label) %>% | |
filter(!is.na(label)) %>% | |
filter(!is.na(down)) | |
nrounds <- 100 | |
params <- | |
list( | |
booster = "gbtree", | |
objective = "multi:softprob", | |
eval_metric = c("mlogloss"), | |
num_class = 26, | |
eta = .025, | |
gamma = 2, | |
subsample=0.8, | |
colsample_bytree=0.8, | |
max_depth = 4, | |
min_child_weight = 1 | |
) | |
smp_size <- floor(0.80 * nrow(rush_attempts4)) | |
set.seed(123) | |
ind <- sample(seq_len(nrow(rush_attempts4)), size = smp_size) | |
ind_train <- rush_attempts4[ind, ] | |
ind_test <- rush_attempts4[-ind, ] | |
full_train <- xgboost::xgb.DMatrix(as.matrix(ind_train %>% select(-label)), label = as.integer(ind_train$label)) | |
ryoe_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) | |
imp <- xgb.importance(colnames(ind_train), model = ryoe_model) | |
xgb.plot.importance(imp) | |
rushes_2020 <- rush_attempts3 %>% | |
filter(season == 2020) %>% | |
select(yardline_100, quarter_seconds_remaining, half_seconds_remaining, | |
game_seconds_remaining, qtr, down, goal_to_go, ydstogo, shotgun, no_huddle, | |
no_score_prob, ep, wp, def_ypc) %>% | |
mutate(index = 1:n()) | |
ryoe_2020 <- stats::predict(ryoe_model, | |
as.matrix(rushes_2020 %>% | |
select(yardline_100, quarter_seconds_remaining, half_seconds_remaining, | |
game_seconds_remaining, qtr, down, goal_to_go, ydstogo, shotgun, no_huddle, | |
no_score_prob, ep, wp, def_ypc))) %>% | |
tibble::as_tibble() %>% | |
dplyr::rename(prob = "value") %>% | |
dplyr::bind_cols(purrr::map_dfr(seq_along(rushes_2020$index), function(x) { | |
tibble::tibble("xyds_rushed" = -5:20, | |
"down" = rushes_2020$down[[x]], | |
"yardline_100" = rushes_2020$yardline_100[[x]], | |
"quarter_seconds_remaining" = rushes_2020$quarter_seconds_remaining[[x]], | |
"half_seconds_remaining" = rushes_2020$half_seconds_remaining[[x]], | |
"game_seconds_remaining" = rushes_2020$game_seconds_remaining[[x]], | |
"qtr" = rushes_2020$qtr[[x]], | |
"goal_to_go" = rushes_2020$goal_to_go[[x]], | |
"ydstogo" = rushes_2020$ydstogo[[x]], | |
"shotgun" = rushes_2020$shotgun[[x]], | |
"no_huddle" = rushes_2020$no_huddle[[x]], | |
"no_score_prob" = rushes_2020$no_score_prob[[x]], | |
"ep" = rushes_2020$ep[[x]], | |
"wp" = rushes_2020$wp[[x]], | |
"index" = rushes_2020$index[[x]]) | |
})) %>% | |
dplyr::group_by(.data$index) %>% | |
dplyr::mutate(max_loss = dplyr::if_else(.data$yardline_100 < 95, -5L, as.integer(.data$yardline_100 - 99L)), | |
max_gain = dplyr::if_else(.data$yardline_100 > 20, 20L, as.integer(.data$yardline_100)), | |
cum_prob = cumsum(.data$prob), | |
prob = dplyr::case_when(.data$xyds_rushed == .data$max_loss ~ .data$prob, | |
.data$xyds_rushed == .data$max_gain ~ 1 - dplyr::lag(.data$cum_prob, 1), | |
TRUE ~ .data$prob), | |
yardline_100 = .data$yardline_100 - .data$xyds_rushed) %>% | |
dplyr::filter(.data$xyds_rushed >= .data$max_loss, .data$xyds_rushed <= .data$max_gain) %>% | |
dplyr::select(-.data$cum_prob) %>% | |
dplyr::summarise(x_rush_yards = sum(.data$prob * .data$xyds_rushed)) %>% | |
ungroup() | |
rushes_2020_2 <- rushes_2020 %>% | |
inner_join(ryoe_2020) | |
pbp_2020 <- pbp_rp %>% | |
inner_join(rushes_2020_2) %>% | |
select(posteam, defteam, rusher_player_name, yards_gained, x_rush_yards, epa) %>% | |
mutate(ryoe = yards_gained - x_rush_yards) | |
rushers_2020 <- pbp_2020 %>% | |
filter(!is.na(rusher_player_name)) %>% | |
group_by(rusher_player_name, posteam) %>% | |
summarize(rushes = n(), | |
sum_ryoe = sum(ryoe, na.rm = T), | |
avg_ryoe = mean(ryoe, na.rm =T), | |
mean_epa = mean(epa, na.rm = T)) %>% | |
filter(rushes > 107) %>% | |
arrange(desc(avg_ryoe)) | |
############################################################################ | |
rushers_2020 <- rushers_2020 %>% | |
left_join(teams_colors_logos, by = c('posteam' = 'team_abbr')) | |
scatter_plot <- rushers_2020 %>% | |
ggplot() + | |
geom_smooth(aes(x = mean_epa, y = avg_ryoe), method = "lm", color = "grey") + | |
ggrepel::geom_text_repel( | |
aes(x = mean_epa, y = avg_ryoe, label = rusher_player_name), | |
box.padding = 0.3, size = 5 | |
) + | |
geom_point( | |
aes(x = mean_epa, y = avg_ryoe, size = rushes, fill = team_color, color = team_color2), | |
shape = 21 | |
) + | |
scale_color_identity(aesthetics = c("fill", "color")) + | |
scale_size(name = "Designed Rushes") + | |
theme_minimal() + | |
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
labs(x = "EPA Per Rush", | |
y = "Rushing Yards Over Expected Per Rush", | |
title = "RYOE and EPA Are Correlated", | |
subtitle = "RYOE is a xgboost model, min. of 105 designed rushes", | |
caption = "By Tej Seth | @mfbanalytics | @deceptivespeed_") + | |
theme( | |
panel.grid.minor = element_blank(), | |
plot.title = element_text(face = "bold", size = 20, hjust = 0.5), | |
plot.subtitle = element_text(size = 10, hjust = 0.5), | |
axis.text = element_text(size = 14), | |
axis.title.y = element_text(size = 14) | |
) | |
scatter_plot | |
ggsave( | |
"ryoe-1.png", scatter_plot, | |
height = 10, width = 16, dpi = "retina" | |
) | |
rusher_faces<- pbp_2020 %>% | |
filter(!is.na(rusher_player_name)) %>% | |
group_by(rusher_player_name, posteam) %>% | |
summarize(rushes = n(), | |
sum_ryoe = sum(ryoe, na.rm = T), | |
avg_ryoe = mean(ryoe, na.rm =T), | |
mean_epa = mean(epa, na.rm = T)) %>% | |
filter(rushes > 75) %>% | |
arrange(desc(avg_ryoe)) | |
write.csv(rusher_faces, "rusher_faces.csv") | |
rusher_faces <- read.csv("~/RYOE/rusher_faces.csv") | |
tab_data <- rusher_faces %>% | |
mutate(RK = as.integer(rank)) %>% | |
select(RK, rusher, headshot, mean_epa, avg_ryoe) %>% | |
arrange(RK) | |
tab_function <- function(data, ...){ | |
data %>% | |
gt() %>% | |
text_transform( | |
locations = cells_body(vars(headshot)), | |
fn = function(x){ | |
web_image( | |
url = x, | |
height = px(25) | |
) | |
} | |
) %>% | |
cols_label( | |
RK = "Rank", | |
rusher = "Rusher", | |
headshot = "", | |
mean_epa = "EPA/Rush", | |
avg_ryoe = "RYOE") %>% | |
data_color( | |
columns = vars(avg_ryoe), | |
colors = scales::col_numeric( | |
palette = c("#af8dc3", "#f7f7f7", "#7fbf7b"), | |
domain = c(-3, 2) | |
) | |
) %>% | |
tab_style( | |
style = cell_text(weight = "bold"), | |
locations = cells_body( | |
columns = vars(RK, rusher) | |
) | |
) %>% | |
tab_header( | |
title = "Top 30 RYOE Rushers", | |
subtitle = "RYOE = Rushing Yards Over Expected" | |
) %>% | |
tab_options( | |
column_labels.background.color = "white", | |
column_labels.font.weight = "bold", | |
table.border.top.width = px(3), | |
table.border.top.color = "transparent", | |
table.border.bottom.color = "transparent", | |
table.border.bottom.width = px(3), | |
column_labels.border.top.width = px(3), | |
column_labels.border.top.color = "transparent", | |
column_labels.border.bottom.width = px(3), | |
column_labels.border.bottom.color = "black", | |
data_row.padding = px(3), | |
source_notes.font.size = 12, | |
table.font.size = 16, | |
heading.align = "middle", | |
... | |
) %>% | |
opt_table_font( | |
font = list( | |
default_fonts() | |
) | |
) | |
} | |
gt_tab1 <- tab_data %>% | |
filter(RK < 31) %>% | |
tab_function() | |
gt_tab1 | |
gtsave(gt_tab1, "gt-tab1.png") | |
gt_tab2 <- tab_data %>% | |
filter(RK >= 31) %>% | |
tab_function() %>% | |
tab_header( | |
title = "Rushers 31-60", | |
subtitle = "RYOE is a xgboost model made by Tej Seth (@mfbanalytics)" | |
) %>% | |
tab_style( | |
style = cell_borders( | |
sides = "left", | |
color = "black", | |
weight = px(3) | |
), | |
locations = | |
list( | |
cells_body( | |
columns = 1 | |
), | |
cells_column_labels(1) | |
) | |
) | |
gt_tab2 | |
gtsave(gt_tab2, "gt-tab2.png") | |
img1 <- magick::image_read("gt-tab1.png") | |
img2 <- magick::image_read("gt-tab2.png") | |
img3 <- magick::image_append(c(img1, img2)) | |
img3 | |
ggsave(img3, "ryoe-2.png") | |
rusher_faces2 <- read.csv("~/RYOE/rusher_faces.csv") | |
rusher_faces2 <- rusher_faces2 %>% | |
left_join(teams_colors_logos, by = c('posteam' = 'team_abbr')) | |
rusher_faces2 <- rusher_faces2 %>% | |
arrange(rank) | |
teams_xp <- pbp_2020 %>% | |
group_by(posteam) %>% | |
summarize(team_avg = mean(x_rush_yards, na.rm = T)) | |
rusher_faces2 <- rusher_faces2 %>% | |
left_join(teams_xp) | |
rusher_faces2 <- rusher_faces2 %>% | |
mutate(yards_gained = avg_ryoe + team_avg, | |
new_rank = row_number()) | |
rusher_faces2 %>% | |
ggplot() + | |
geom_link( | |
mapping = aes(x = team_avg, y = new_rank, xend = yards_gained, yend = new_rank, size = 2, color = team_color) | |
) + | |
theme_bw() + | |
scale_colour_identity() + | |
geom_image(aes(x = team_avg, y = new_rank, image = team_logo_espn), size = 0.04, asp = 16/9) + | |
geom_image(aes(x = yards_gained, y = new_rank, image = headshot), size = 0.04, asp = 16/9) + | |
labs( | |
x = "Rushing Yards Average", | |
y = "", | |
title = "Each Team's Most Used Rusher's RYOE", | |
subtitle = "RYOE = Rushing Yards Over Expected, if a player's face is to the right of their logo they have a positive RYOE", | |
caption = "By Tej Seth | @mfbanalytics | @deceptivespeed_" | |
) + | |
theme( | |
plot.title = element_markdown(hjust = 0.5, size = 20, face = "bold"), | |
plot.subtitle = element_markdown(hjust = 0.5, size = 12), | |
axis.title.y=element_blank(), | |
axis.text.y=element_blank(), | |
axis.ticks.y=element_blank(), | |
legend.position = "none", | |
panel.grid.major.y = element_blank(), | |
panel.grid.minor.y = element_blank(), | |
panel.background = element_blank(), | |
panel.border= element_blank() | |
) + | |
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
scale_y_reverse(breaks = scales::pretty_breaks(n = 10)) | |
ggsave('ryoe-3.png', dpi=300, height=9*.8, width=16*.8) | |
top_rushers <- pbp_2020 %>% | |
filter(!is.na(rusher_player_name)) %>% | |
group_by(rusher_player_name) %>% | |
summarize(rushes = n(), | |
sum_ryoe = sum(ryoe, na.rm = T)) %>% | |
filter(rushes > 107) %>% | |
arrange(desc(sum_ryoe)) %>% | |
filter(sum_ryoe > 0) | |
top_rushers <- top_rushers %>% | |
left_join(rusher_faces) | |
top_rushers <- top_rushers %>% | |
left_join(teams_colors_logos, by = c("posteam" = "team_abbr")) %>% | |
filter(!is.na(rank)) %>% | |
mutate(rank = row_number()) | |
link_to_img <- function(x, width = 50) { | |
glue::glue("<img src='{x}' width='{width}'/>") | |
} | |
bar_plot <- top_rushers %>% | |
mutate(label = link_to_img(headshot), | |
rank = as.integer(rank)) %>% | |
ggplot() + | |
geom_col( | |
aes( | |
x = rank, y = sum_ryoe, | |
fill = team_color, color = team_color2 | |
), | |
width = 0.4 | |
) + | |
geom_image(aes(x = rank, y = sum_ryoe + 5 , image = headshot), asp = 16/9, size = 0.06) + | |
scale_color_identity(aesthetics = c("fill", "color")) + | |
theme_minimal() + | |
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + | |
labs(x = NULL, | |
y = "Total RYOE\n", | |
title = "The Running Backs 10 in Rushing Yards Over Expected", | |
caption = "By Tej Seth | @mfbanalytics") + | |
theme( | |
panel.grid.minor = element_blank(), | |
plot.title = element_text(face = "bold", size = 20, hjust = 0.5), | |
plot.subtitle = element_text(size = 10, hjust = 0.5), | |
axis.text = element_text(size = 14, face = "bold"), | |
axis.title.y = element_text(size = 16, face = "bold") | |
) | |
bar_plot | |
ggsave( | |
"ryoe-4.png", bar_plot, | |
height = 10, width = 16, dpi = "retina" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment