Skip to content

Instantly share code, notes, and snippets.

@ccagrawal
Last active August 29, 2015 14:08
Show Gist options
  • Save ccagrawal/28f5b06f0578f5357df4 to your computer and use it in GitHub Desktop.
Save ccagrawal/28f5b06f0578f5357df4 to your computer and use it in GitHub Desktop.
NBA Homecourt Advantage
# Quantifies homecourt advantage in each regular season
library(RCurl)
library(XML)
# Download the Basketball Reference regular season schedule for one season and
# compute the home margin of victory for each game.
#
# Args:
#   year: season to fetch; it is inserted into the URL as NBA_<year>_games.html.
#
# Returns:
#   Data frame with columns date (POSIXct), awayName, homeName, homeMargin
#   (home points minus away points) and year. Rows whose score cells are not
#   numeric get an NA homeMargin (as.numeric() warns when that happens).
GetSchedule <- function(year) {
  url <- paste0(
    "http://www.basketball-reference.com/leagues/NBA_", year, "_games.html"
  )
  tables <- readHTMLTable(url)
  schedule <- tables[["games"]]
  if (is.null(schedule)) {
    stop("No 'games' table found at ", url, call. = FALSE)
  }

  # Keep five columns by position and name them.
  # NOTE(review): the positions assume the page lays its columns out as
  # date, <skipped>, away team, away points, home team, home points -- confirm.
  schedule <- schedule[, c(1, 3, 4, 5, 6)]
  colnames(schedule) <- c(
    "date", "awayName", "awayPoints", "homeName", "homePoints"
  )

  # strptime() returns POSIXlt, which is a poor fit for a data frame column
  # that is later row-bound across seasons; store POSIXct instead.
  schedule$date <- as.POSIXct(
    strptime(as.character(schedule$date), format = "%a, %b %d, %Y")
  )

  # Go through as.character() so factor columns convert by label, not by code
  schedule$awayPoints <- as.numeric(as.character(schedule$awayPoints))
  schedule$homePoints <- as.numeric(as.character(schedule$homePoints))
  schedule$homeMargin <- schedule$homePoints - schedule$awayPoints

  schedule <- schedule[, c("date", "awayName", "homeName", "homeMargin")]
  schedule$year <- year
  schedule
}
# Calculate the rolling mean, SD, and confidence interval of the home margin,
# rolling over windows of `n` consecutive seasons.
#
# Args:
#   fullSchedule: data frame with one row per game and at least the columns
#                 `year` and `homeMargin`.
#   n:            number of seasons in each rolling window.
#   startYear:    first season to consider.
#   endYear:      last season to consider.
#   confidence:   confidence level of the interval, e.g. 0.95.
#
# Returns:
#   Data frame with one row per window, labelled by the window's last season,
#   and columns year, games, mean, sd, lb, ub. `games` counts the games with a
#   non-missing margin; lb/ub bound the two-sided confidence interval of the
#   mean home margin.
CalcStats <- function(fullSchedule, n, startYear, endYear, confidence) {
  if (endYear - startYear - n + 2 < 1) {
    stop(
      "Need at least `n` seasons between startYear and endYear",
      call. = FALSE
    )
  }

  # Each window is identified by its final season
  windowEnds <- seq(from = startYear + n - 1, to = endYear)

  stats <- vapply(windowEnds, function(lastYear) {
    windowYears <- seq(from = lastYear - n + 1, to = lastYear)
    margins <- fullSchedule$homeMargin[fullSchedule$year %in% windowYears]

    # Rows without a margin would turn every statistic into NA
    margins <- margins[!is.na(margins)]
    games <- length(margins)

    avg <- mean(margins)
    spread <- sd(margins)

    # A confidence interval of the mean uses the standard error
    # (sd / sqrt(games)), not the raw standard deviation
    criticalT <- qt(1 - (1 - confidence) / 2, games - 1)
    halfWidth <- criticalT * spread / sqrt(games)

    c(
      games = games, mean = avg, sd = spread,
      lb = avg - halfWidth, ub = avg + halfWidth
    )
  }, numeric(5))

  # vapply() yields one column per window; transpose to one row per window
  data.frame(year = windowEnds, t(stats), row.names = NULL)
}
# BBall Ref has schedules from 1950 - 2014
startYear <- 1950
endYear <- 2014

# Download every season's schedule, printing each year as progress, then merge
# them into 1 df with a single rbind. Growing the data frame inside the loop
# re-copies all previously downloaded rows on every iteration.
seasons <- lapply(startYear:endYear, function(year) {
  schedule <- GetSchedule(year)
  cat(year, "\n")
  schedule
})
fullSchedule <- do.call(rbind, seasons)

# Rolling statistics over 5-season windows at a 0.95 confidence level
results <- CalcStats(fullSchedule, 5, startYear, endYear, .95)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment