Skip to content

Instantly share code, notes, and snippets.

@gallochris
Last active January 11, 2025 20:42
Show Gist options
  • Save gallochris/49bf7bc8ee72c4dbec3fbfe653d85cc1 to your computer and use it in GitHub Desktop.
Save gallochris/49bf7bc8ee72c4dbec3fbfe653d85cc1 to your computer and use it in GitHub Desktop.
This is an attempt to guess how the NET is computed, inspired from bchare work
# This is a script to guess _how_ the NET is computed
# Credit for almost all of this work goes to bchare: https://github.com/bchare/ncaabball
# This is an attempt to transform his Python code into R
# -----------------------------------------------------------
# Gamelogs are loaded from the following repo:
# This fetches data from cbbdata/torvik and saves a daily csv
# Job runs around 7 am ET each day to fetch data
gamelogs <- readr::read_csv(
"https://raw.githubusercontent.com/gallochris/evdev-byc/refs/heads/main/data/cbb_daily_gamelog.csv"
)
# Add games and calculate the scoring margin in points per 100 possessions
games <- gamelogs |>
dplyr::mutate(
raw_net_eff = 100 * (
pts / (fga - oreb + to + 0.475 * fta) -
opp_pts / (opp_fga - opp_oreb + opp_to + 0.475 * opp_fta)
),
pts_dif = pts - opp_pts,
avg_pos = 0.5 * (
(fga - oreb + to + 0.475 * fta) +
(opp_fga - opp_oreb + opp_to + 0.475 * opp_fta)
),
hca = dplyr::case_match(location, "H" ~ 1, # this matches the code in the original data examples
"A" ~ -1, # see here: https://github.com/bchare/ncaabball/blob/main/ncaab_stats_input_net_2025.csv
"N" ~ 0)
)
# -----------------------------------------------------------
# Function that mirrors Python's sklearn's implementation
# This was a lot of trial and error
# Goal is adjusting for opponent and location
custom_ridge <- function(X, y, alpha = 1) {
X <- cbind(1, X)
n <- nrow(X)
p <- ncol(X)
penalty <- diag(p)
penalty[1, 1] <- 0
# Solve ridge regression (matches sklearn implementation)
beta <- solve(t(X) %*% X + alpha * penalty, t(X) %*% y)
list(coefficients = beta[-1], intercept = beta[1])
}
# Dummy variables
team_dummies <- model.matrix( ~ 0 + team, data = games)
opp_dummies <- model.matrix( ~ 0 + opp, data = games)
hca_dummy <- model.matrix( ~ 0 + hca, data = games)
# Combine all variables
games_dummy_vars <- cbind(team_dummies, opp_dummies, hca_dummy)
# Fit ridge regression
ridge_fit <- custom_ridge(games_dummy_vars, games$raw_net_eff, alpha = 1)
# Add data frame
net_stats <- tibble::tibble(
team = colnames(games_dummy_vars),
efficiency = c(ridge_fit$coefficients) + ridge_fit$intercept # Add intercept to match Python
)
# Determine home court advantage
home_court_advantage <- net_stats |>
dplyr::filter(team == "hca") |>
dplyr::pull(efficiency)
message("Home Court Advantage is ",
round(home_court_advantage, 2),
" Points Per 100 Possessions")
# Compute the team stats
net_stats <- net_stats |>
dplyr::filter(stringr::str_detect(team, "^team")) |>
dplyr::mutate(
team = stringr::str_remove(team, "^team"),
efficiency_rtg = 100 * pnorm(efficiency, mean(efficiency), sd(efficiency)),
efficiency_rank = rank(-efficiency, ties.method = "min")
)
# Bradley-Terry model and add fictional game
# This is the first step towards the "value" ranking -so not efficiency
# Is this what is used for WAB?
fiction <- games |>
dplyr::select(team) |>
unique() |>
dplyr::mutate(
opp = "ZZZ_FICTIONAL",
hca = 0,
pts = 100,
opp_pts = 101
)
fiction2 <- fiction |>
dplyr::mutate(opp_pts = 99)
games <- dplyr::bind_rows(games, fiction, fiction2)
teams <- sort(unique(c(games$team, games$opp)))
# Function to create game vectors
get_vector <- function(game_row) {
result <- numeric(length(teams) + 1) # +1 for y
names(result) <- c("y", teams)
result["y"] <- as.numeric(game_row$pts > game_row$opp_pts)
result[game_row$team] <- 1
result[game_row$opp] <- -1
return(result)
}
model_matrix <- do.call(rbind, lapply(split(games, seq(nrow(
games
))), get_vector))
# Convert to data frame and add HCA
model_data <- as.data.frame(model_matrix)
model_data$hca <- games$hca
# Add logistic regression
y <- model_data$y
X <- model_data |> dplyr::select(-y)
# Fit logistic regression
log_model <- glm(y ~ . - 1, data = cbind(y = y, X), family = binomial())
# Return team values
teamvalue <- tibble::tibble(team = names(coef(log_model)), value = as.vector(coef(log_model))) |>
dplyr::filter(team != "ZZZ_FICTIONAL", team != "hca") |>
dplyr::mutate(
value_rtg = 100 * pnorm(value, mean(value), sd(value)),
value_rank = rank(-value, ties.method = "min"),
team = stringr::str_remove_all(team, "`")
)
# Combine metrics and calculate estimated NET ranking
net_stats <- net_stats |>
dplyr::inner_join(teamvalue, by = "team") |>
dplyr::mutate(estimated_net = rank(-(0.7999 * efficiency_rtg + 0.2001 * value_rtg), ties.method = "min")) |>
dplyr::arrange(estimated_net) |>
dplyr::mutate(
efficiency_rank = as.integer(efficiency_rank),
value_rank = as.integer(value_rank),
estimated_net = as.integer(estimated_net)
) |>
dplyr::relocate(estimated_net)
# -----------------------------
# Compare the estimated NET to the real NET
# Function to clean names of certain teams
team_name_update <- function(team_var) {
team_var = dplyr::case_match(
team_var,
"Charleston" ~ "College of Charleston",
"LIU" ~ "LIU Brooklyn",
"Detroit Mercy" ~ "Detroit",
"Purdue Fort Wayne" ~ "Fort Wayne",
"Louisiana" ~ "Louisiana Lafayette",
"N.C. State" ~ "North Carolina St.",
"IU Indy" ~ "IUPUI",
"Saint Francis" ~ "St. Francis PA",
team_var ~ team_var
)
}
# Fetch the real NET
real_net <- cbbdata::cbd_torvik_current_resume() |>
dplyr::select(team, net) |>
dplyr::filter(team != "Team") |>
dplyr::mutate(team = team_name_update(team))
# Now join the data and compare it
compare_net <- net_stats |>
dplyr::left_join(real_net, by = "team") |>
dplyr::mutate(net_diff = estimated_net - net) |>
dplyr::relocate(net_diff, .before = estimated_net)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment