Skip to content

Instantly share code, notes, and snippets.

@abresler
Created February 5, 2015 02:14
Show Gist options
  • Save abresler/1012f0fc103c6ef17607 to your computer and use it in GitHub Desktop.
Save abresler/1012f0fc103c6ef17607 to your computer and use it in GitHub Desktop.
nba_2015_random_forest_vars
library("ggthemes")
c("randomForest","dplyr","magrittr","ggplot2","ggthemes") -> packages
lapply(packages,library, character.only = TRUE)
getBREFTeamStatTable <- function(season_end = 2015, table_name = 'team', date = T){
c('rvest','dplyr','pipeR','RCurl', 'XML','reshape2') -> packages
lapply(packages, library, character.only = T)
'http://www.basketball-reference.com/leagues/' -> base
(season_end-1) %>>% paste0("-",season_end) -> season
'NBA' -> league
table_name %>>% tolower -> table_name
'#' %>>% paste0(table_name) -> css_page
table_name %>>% paste('stats', sep = "_") -> table
css_page %>>% paste0(" a") -> css_id
base %>>% paste0(league,'_',season_end,".html") -> url
if(table_name == 'standings'){
pipeline({
url
html
html_table(fill = T)
~t
})
if(season_end >= 1971){
t[2] %>>% data.frame %>>% tbl_df -> df
c('team','wins','losses','win_pct','games_back','pts.g','opp_pts.g','srs') -> names(df)
'Eastern' -> df$conference
t[3] %>>% data.frame() %>>% tbl_df -> df2
c('team','wins','losses','win_pct','games_back','pts.g','opp_pts.g','srs') -> names(df2)
'Western' -> df2$conference
rbind_list(df,df2) -> df
df$team %>>% (grepl('\\*',.)) -> df$playoff_team
df$team %>>% (gsub('\\*','',.)) -> df$team
df$team %>>% (colsplit(.,'\\(',c('team','conference_rank'))) %>>%
(.$conference_rank) -> conference_rank
conference_rank %>>% (gsub('\\)','',.)) %>>% as.numeric -> df$conference_rank
df$team %>>% (colsplit(.,'\\(',c('team','conference_rank'))) %>>%
(.$team) %>>% Trim -> df$team
} else{
t[2] %>>% data.frame %>>% tbl_df -> df
c('team','wins','losses','win_pct','games_back','pts.g','opp_pts.g','srs') -> names(df)
df$team %>>% (grep('Western Division',.)) %>>% as.numeric -> div
'' -> df$conference
'Eastern' -> df[1:div,'conference']
'Western' -> df[div:nrow(df),'conference']
if(grep('\\(',df$team) %>>% length > 0){
df$team %>>% (colsplit(.,'\\(',c('team','conference_rank'))) %>>%
(.$conference_rank) -> conference_rank
conference_rank %>>% (gsub('\\)','',.)) %>>% as.numeric -> df$conference_rank
df$team %>>% (colsplit(.,'\\(',c('team','conference_rank'))) %>>%
(.$team) %>>% Trim -> df$team
}
df$team %>>% (grepl('\\*',.)) -> df$playoff_team
df$team %>>% (gsub('\\*','',.)) -> df$team
}
df[df$games_back == '—','games_back'] <- 0
df$games_back %>>% as.numeric -> df$games_back
df %>>%
filter(team == 'Baltimore Bullets'|!is.na(wins)) -> df
df$pts.g - df$opp_pts.g -> df$point_differential
pipeline({
url
html
xpathSApply(path = '//*[(@id = "all_standings")]//a',xmlAttrs)
unlist
~team_url_stems
})
NULL -> names(team_url_stems)
team_url_stems %>>% (gsub('.html|\\/teams|\\/','',.)) %>>%
(gsub(pattern = "[^[:alpha:]]", replacement = "", .)) -> bref_team_id
data.frame(season,table_name = 'standings', bref_team_id, df) -> df
'http://www.basketball-reference.com' %>>% paste0(team_url_stems) -> df$bref_team_season_url
} else{
url %>>% ## get table
html %>>%
html_nodes(css_page) %>>%
html_table(header = F) %>>% data.frame() %>>% tbl_df() -> df
if(df$X1[1] == 'Rk'){
df %>>%
filter(X1 == "Rk") %>>% as.character -> names
'Rk' %>>% grep(x = df$X1) -> row_of_header #find where rank is
(row_of_header + 1) %>>% (df[.:nrow(df),]) -> df #skip that row and go to the end
names %>>% tolower-> names(df)} else{
df %>>%
filter(X1 == "Rk") %>>% as.character -> names
'Rk' %>>% grep(x = df$X1) -> row_of_header #find where rank is
(row_of_header + 1) %>>% (df[.:nrow(df),]) -> df #skip that row and go to the end
names %>>% tolower-> names(df)
}
names(df) %>>% (gsub('\\%|/','\\.',.)) -> names(df)
NULL -> df$rk
c('team','arena') -> table_name_character
df[,!(df %>>% names) %in% table_name_character] %>>% apply(2, function(x) gsub('\\,','',x) %>>%
as.numeric(x)) -> df[,!(df %>>% names) %in% table_name_character] #get rid of commas and make numeric
df$team %>>% grepl(pattern = '\\*') -> df$playoff_team
df$team %>>% (gsub('\\*','',.)) -> df$team
df %>>% nrow() -1 -> rows
df[1:rows,] -> df
paste0("//*[(@id = '",table_name,"')]//a") -> xpath
##Grab Team Ids
url %>>% ## get table
html %>>%
xpathSApply(xpath,xmlAttrs) %>>%
unlist %>>% as.character -> stems
stems %>>% (gsub('\\/|.html|teams','',.)) %>>%
(gsub(season_end,'',.)) -> bref_team_id
data.frame(season,table_name = table, bref_team_id, df) -> df
'http://www.basketball-reference.com' %>>% paste0(stems) -> df$bref_team_season_url
}
if(date == T){
Sys.time() -> df$scrape_time
}
return(df)
}
getBREFTeamStatTable(season_end = 2015) -> stats
getBREFTeamStatTable(season_end = 2015, table_name = 'standings') -> wins
getBREFTeamStatTable(season_end = 2015, table_name = 'opponent') -> opp
getBREFTeamStatTable(season_end = 2015, table_name = 'misc') -> misc
misc %>%
select(bref_team_id:ft.fga.1,attendance,-pl,-pw) -> misc
opp %>%
select(team,bref_team_id,fg:pts.g) -> opp
names(opp)[3:24] %<>% paste0("opp_",.)
wins[,c('team','wins')] %>%
left_join(stats) -> nba_team_data
nba_team_data %>% left_join(opp) -> nba_team_data
misc %>% left_join(nba_team_data) %>%
select(-season,-table_name,-playoff_team,-bref_team_season_url,-scrape_time) -> data_nba
data_nba %>%
select(-team,-bref_team_id, -g,-mp,-mov, -sos, -srs, -pts) -> rf_data
formula = as.formula(wins ~ .)
library(randomForest)
set.seed(Sys.time())
#formula = as.formula(Wins ~ FG + FGA + Three_Point + Three_PointA + FT + FTA +
# ORB + DRB + AST + STL + BLK + TOV + PF + PTS + FG_Opp + FGA_Opp +
# Three_Point_Opp + Three_PointA_Opp + FT_Opp + FTA_Opp + ORB_Opp + DRB_Opp +
# AST_Opp + STL_Opp + BLK_Opp + TOV_Opp + PF_Opp + PTS_Opp + Age)
formula %>%
randomForest(data=rf_data, mtry=5, ntree=10000, importance=TRUE) -> rf
imp = data.frame(importance(rf), check.names=F)
c("MSE.Increase") -> names(imp)[1]
#row.names(imp) -> imp$name
row.names(imp) -> imp$column_name
imp %>% tbl_df -> imp
### Set Correct Names
#"~/Desktop/Github/asb_shiny_apps/nba/explorer/v3/data/advanced_player_stat_table_names.csv" %>%
# read.csv -> adv_name
#"~/Desktop/Github/asb_shiny_apps/nba/explorer/v3/data/per_game_player_stat_table_names.csv" %>%
# read.csv -> stats_names
#rbind(stats_names,adv_name) %>% unique -> table.names
#merge(table.names, imp, all.x = T, all.y = T) %>%
# filter(!is.na(MSE.Increase)) -> imp
#write.csv(imp,'imp.csv')
#c('Attendance','Opponent Effective Field Goal %','FT to FGA','Opponent FT to FGA','Opponent Assists','Opponent Blocks','Opponent Defensive Rebounds','Opponent Field Goals','Opponent Field Goal %','Opponent Field Goal Attempts','Opponent Free Throws','Opponent Free Throw %','Opponent Free Throw Attempts',,,,,,,
# , ->imp[2:60,] -> imp
#'imp.csv' %>% read.csv %>% tbl_df -> imp
imp %>%
ggplot(aes(x=reorder(column_name,MSE.Increase), y=MSE.Increase, fill = name)) +
geom_bar(stat='identity') +
geom_hline(yintercept=abs(min(imp$MSE.Increase)), col=2, linetype='dashed') +
coord_flip() +
theme_fivethirtyeight() +
ylab("MSE Increase") +
theme(legend.position="none") +
scale_y_continuous(breaks=seq(0,40,2)) +
labs(title="NBA Win Contribution Factors via Random Forest, 2014-15 Season",x= "MSE Increase")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment