Last active
January 11, 2025 20:42
-
-
Save gallochris/49bf7bc8ee72c4dbec3fbfe653d85cc1 to your computer and use it in GitHub Desktop.
This is an attempt to guess how the NET is computed, inspired from bchare work
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a script to guess _how_ the NET is computed | |
# Credit for almost all of this work goes to bchare: https://github.com/bchare/ncaabball | |
# This is an attempt to transform his Python code into R | |
# ----------------------------------------------------------- | |
# Gamelogs are loaded from the following repo: | |
# This fetches data from cbbdata/torvik and saves a daily csv | |
# Job runs around 7 am ET each day to fetch data | |
gamelogs <- readr::read_csv( | |
"https://raw.githubusercontent.com/gallochris/evdev-byc/refs/heads/main/data/cbb_daily_gamelog.csv" | |
) | |
# Add games and calculate the scoring margin in points per 100 possessions | |
games <- gamelogs |> | |
dplyr::mutate( | |
raw_net_eff = 100 * ( | |
pts / (fga - oreb + to + 0.475 * fta) - | |
opp_pts / (opp_fga - opp_oreb + opp_to + 0.475 * opp_fta) | |
), | |
pts_dif = pts - opp_pts, | |
avg_pos = 0.5 * ( | |
(fga - oreb + to + 0.475 * fta) + | |
(opp_fga - opp_oreb + opp_to + 0.475 * opp_fta) | |
), | |
hca = dplyr::case_match(location, "H" ~ 1, # this matches the code in the original data examples | |
"A" ~ -1, # see here: https://github.com/bchare/ncaabball/blob/main/ncaab_stats_input_net_2025.csv | |
"N" ~ 0) | |
) | |
# ----------------------------------------------------------- | |
# Function that mirrors Python's sklearn's implementation | |
# This was a lot of trial and error | |
# Goal is adjusting for opponent and location | |
custom_ridge <- function(X, y, alpha = 1) { | |
X <- cbind(1, X) | |
n <- nrow(X) | |
p <- ncol(X) | |
penalty <- diag(p) | |
penalty[1, 1] <- 0 | |
# Solve ridge regression (matches sklearn implementation) | |
beta <- solve(t(X) %*% X + alpha * penalty, t(X) %*% y) | |
list(coefficients = beta[-1], intercept = beta[1]) | |
} | |
# Dummy variables | |
team_dummies <- model.matrix( ~ 0 + team, data = games) | |
opp_dummies <- model.matrix( ~ 0 + opp, data = games) | |
hca_dummy <- model.matrix( ~ 0 + hca, data = games) | |
# Combine all variables | |
games_dummy_vars <- cbind(team_dummies, opp_dummies, hca_dummy) | |
# Fit ridge regression | |
ridge_fit <- custom_ridge(games_dummy_vars, games$raw_net_eff, alpha = 1) | |
# Add data frame | |
net_stats <- tibble::tibble( | |
team = colnames(games_dummy_vars), | |
efficiency = c(ridge_fit$coefficients) + ridge_fit$intercept # Add intercept to match Python | |
) | |
# Determine home court advantage | |
home_court_advantage <- net_stats |> | |
dplyr::filter(team == "hca") |> | |
dplyr::pull(efficiency) | |
message("Home Court Advantage is ", | |
round(home_court_advantage, 2), | |
" Points Per 100 Possessions") | |
# Compute the team stats | |
net_stats <- net_stats |> | |
dplyr::filter(stringr::str_detect(team, "^team")) |> | |
dplyr::mutate( | |
team = stringr::str_remove(team, "^team"), | |
efficiency_rtg = 100 * pnorm(efficiency, mean(efficiency), sd(efficiency)), | |
efficiency_rank = rank(-efficiency, ties.method = "min") | |
) | |
# Bradley-Terry model and add fictional game | |
# This is the first step towards the "value" ranking -so not efficiency | |
# Is this what is used for WAB? | |
fiction <- games |> | |
dplyr::select(team) |> | |
unique() |> | |
dplyr::mutate( | |
opp = "ZZZ_FICTIONAL", | |
hca = 0, | |
pts = 100, | |
opp_pts = 101 | |
) | |
fiction2 <- fiction |> | |
dplyr::mutate(opp_pts = 99) | |
games <- dplyr::bind_rows(games, fiction, fiction2) | |
teams <- sort(unique(c(games$team, games$opp))) | |
# Function to create game vectors | |
get_vector <- function(game_row) { | |
result <- numeric(length(teams) + 1) # +1 for y | |
names(result) <- c("y", teams) | |
result["y"] <- as.numeric(game_row$pts > game_row$opp_pts) | |
result[game_row$team] <- 1 | |
result[game_row$opp] <- -1 | |
return(result) | |
} | |
model_matrix <- do.call(rbind, lapply(split(games, seq(nrow( | |
games | |
))), get_vector)) | |
# Convert to data frame and add HCA | |
model_data <- as.data.frame(model_matrix) | |
model_data$hca <- games$hca | |
# Add logistic regression | |
y <- model_data$y | |
X <- model_data |> dplyr::select(-y) | |
# Fit logistic regression | |
log_model <- glm(y ~ . - 1, data = cbind(y = y, X), family = binomial()) | |
# Return team values | |
teamvalue <- tibble::tibble(team = names(coef(log_model)), value = as.vector(coef(log_model))) |> | |
dplyr::filter(team != "ZZZ_FICTIONAL", team != "hca") |> | |
dplyr::mutate( | |
value_rtg = 100 * pnorm(value, mean(value), sd(value)), | |
value_rank = rank(-value, ties.method = "min"), | |
team = stringr::str_remove_all(team, "`") | |
) | |
# Combine metrics and calculate estimated NET ranking | |
net_stats <- net_stats |> | |
dplyr::inner_join(teamvalue, by = "team") |> | |
dplyr::mutate(estimated_net = rank(-(0.7999 * efficiency_rtg + 0.2001 * value_rtg), ties.method = "min")) |> | |
dplyr::arrange(estimated_net) |> | |
dplyr::mutate( | |
efficiency_rank = as.integer(efficiency_rank), | |
value_rank = as.integer(value_rank), | |
estimated_net = as.integer(estimated_net) | |
) |> | |
dplyr::relocate(estimated_net) | |
# ----------------------------- | |
# Compare the estimated NET to the real NET | |
# Function to clean names of certain teams | |
team_name_update <- function(team_var) { | |
team_var = dplyr::case_match( | |
team_var, | |
"Charleston" ~ "College of Charleston", | |
"LIU" ~ "LIU Brooklyn", | |
"Detroit Mercy" ~ "Detroit", | |
"Purdue Fort Wayne" ~ "Fort Wayne", | |
"Louisiana" ~ "Louisiana Lafayette", | |
"N.C. State" ~ "North Carolina St.", | |
"IU Indy" ~ "IUPUI", | |
"Saint Francis" ~ "St. Francis PA", | |
team_var ~ team_var | |
) | |
} | |
# Fetch the real NET | |
real_net <- cbbdata::cbd_torvik_current_resume() |> | |
dplyr::select(team, net) |> | |
dplyr::filter(team != "Team") |> | |
dplyr::mutate(team = team_name_update(team)) | |
# Now join the data and compare it | |
compare_net <- net_stats |> | |
dplyr::left_join(real_net, by = "team") |> | |
dplyr::mutate(net_diff = estimated_net - net) |> | |
dplyr::relocate(net_diff, .before = estimated_net) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment