Last active
September 30, 2015 00:17
-
-
Save Ray901/12ff59fb161cde99ac28 to your computer and use it in GitHub Desktop.
Web scraping ESPN NBA box scores with R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script setup: reset the workspace, pick the game date, load dependencies.
# NOTE(review): rm(list = ls()) wipes every object in the caller's global
# environment; it is kept only to preserve the script's existing behaviour.
rm(list = ls())
# Start time of the run (not read anywhere in the visible part of the script).
tic <- Sys.time()
############################################################################
# Date of the games to scrape, written as "YYYYMMDD".
setDate <- "20150430"
############################################################################
# Install each dependency that is missing, then attach it. library() is used
# for attaching so that a package that still cannot be loaded stops the
# script immediately instead of only emitting a warning, as require() does.
for (pkg in c("XML", "RCurl", "xlsx", "miscTools")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
############################################################################ | |
# Split a "made-attempted" shooting string into its two numbers.
#
# shootNumber: a single string such as "7-15" (text before the first "-" is
#   the made count, text after it is the attempted count).
# Returns a numeric vector of length 2, c(made, attempted). A side that is
#   not a number becomes NA, and input without any "-" (or NA input) returns
#   c(NA, NA) instead of failing with an indexing error.
boxShootNumber <- function(shootNumber) {
  playerShoot <- as.character(shootNumber)
  if (length(playerShoot) != 1L || is.na(playerShoot)) {
    return(c(NA_real_, NA_real_))
  }
  # regexpr() reports -1 when the string holds no dash at all.
  dashAt <- regexpr("-", playerShoot, fixed = TRUE)
  if (dashAt < 1) {
    return(c(NA_real_, NA_real_))
  }
  shootM <- as.numeric(substr(playerShoot, 1, dashAt - 1))
  shootA <- as.numeric(substr(playerShoot, dashAt + 1, nchar(playerShoot)))
  c(shootM, shootA)
}
############################################################################ | |
# Build the list of box score URLs for the games played on setDate.
# An empty setDate means "yesterday".
if (nchar(setDate) == 0) {
  setDate <- format(Sys.Date() - 1, "%Y%m%d")
}
# Download the scoreboard page for that date, one vector element per line.
html <- readLines(
  paste0("http://scores.espn.go.com/nba/scoreboard?date=", setDate),
  warn = FALSE, encoding = "UTF-8"
)
# Earlier extraction approach, kept for reference:
# html<-readLines("http://scores.espn.go.com/nba/scoreboard",warn=F,encoding = "UTF-8")
# A <- as.character(unlist(strsplit(html[grep("scoreboard-page",html)],",")))
# gamehtml<-substr(A[grep("http://espn.go.com/nba/boxscore",A)],9,56)
# The game ids are read from the line(s) mentioning window.espn.scoreboardData.
scriptdata <- html[grep("window.espn.scoreboardData", html)]
if (length(scriptdata) == 0) {
  stop("No scoreboard data found on the ESPN page for ", setDate,
       call. = FALSE)
}
# Treat several matching lines as one string so positions stay consistent.
scriptdata <- paste(scriptdata, collapse = "")
indexStart <- unlist(gregexpr("gameId", scriptdata))
# gregexpr() reports -1 when the pattern does not occur; drop that marker so
# it is never turned into a bogus id.
indexStart <- indexStart[indexStart > 0]
if (length(indexStart) == 0) {
  stop("No games found on the ESPN scoreboard for ", setDate, call. = FALSE)
}
# Each token is the 16 characters starting at "gameId" (presumably "gameId="
# followed by the numeric id -- verify against the page). The same game can
# be mentioned several times, hence unique().
gameID <- unique(substring(scriptdata, indexStart, indexStart + 15))
gamehtml <- paste0("http://espn.go.com/nba/boxscore?", gameID)
# Scrape every box score page in `gamehtml` and write each game to its own
# worksheet of one workbook. Reads `gamehtml` and `setDate` from the script,
# calls boxShootNumber(), and needs XML, xlsx and miscTools attached.
outFile <- paste0("d:/NBA_boxscores_", setDate, ".xls")
# Becomes TRUE once a sheet has been written: the first game that actually
# has a box score creates the workbook and every later one appends to it,
# even when earlier games were skipped.
sheetWritten <- FALSE
for (ig in seq_along(gamehtml)) {
  onehtml <- readLines(gamehtml[ig], warn = FALSE, encoding = "UTF-8")
  # Skip pages that contain no MIN header cell (presumably games without a
  # published box score yet).
  indexHeader <- grep("width=5%>MIN", onehtml)
  if (length(indexHeader) == 0) {
    next
  }
  # Column headers: the <th> cells on the first MIN line and the 13 lines
  # that follow it.
  pagetree <- htmlTreeParse(
    onehtml[indexHeader[1]:(indexHeader[1] + 13)],
    useInternalNodes = TRUE, encoding = "UTF-8"
  )
  boxFieldName <- xpathSApply(pagetree, "//th", xmlValue)
  #boxFieldName[13]<-"increase or decrease"
  # Team names: the <th> cells on the lines carrying the small team logo.
  pagetree <- htmlTreeParse(
    onehtml[grep("logo-small logo-nba-small nba-small", onehtml)],
    useInternalNodes = TRUE, encoding = "UTF-8"
  )
  teamName <- xpathSApply(pagetree, "//th", xmlValue)
  # Lines holding a player link; the first match is discarded, as before.
  indexPlayer <- grep("http://espn.go.com/nba/player/_/id/", onehtml)[-1]
  # Split the player lines into starters/bench of each team by their position
  # relative to the STARTERS and BENCH header lines.
  indexStart <- grep("<th style=\"text-align:left\">STARTERS</th>", onehtml)
  indexBench <- grep("<th style=\"text-align:left\">BENCH</th>", onehtml)
  indexOneStart <- indexPlayer[which(indexPlayer < indexBench[1])]
  indexOneBench <- indexPlayer[which(indexPlayer > indexBench[1] &
                                       indexPlayer < indexStart[2])]
  indexTwoStart <- indexPlayer[which(indexPlayer > indexStart[2] &
                                       indexPlayer < indexBench[2])]
  indexTwoBench <- indexPlayer[which(indexPlayer > indexBench[2])]
  # One row per player: the name cell followed by one cell per header field.
  boxscores <- array(NA, dim = c(length(indexPlayer), length(boxFieldName) + 1))
  for (ip in seq_along(indexPlayer)) {
    pagetree <- htmlTreeParse(onehtml[indexPlayer[ip]],
                              useInternalNodes = TRUE, encoding = "UTF-8")
    playerCells <- xpathSApply(pagetree, "//td", xmlValue)
    # Rows with fewer cells than columns keep NA in the remaining columns.
    if (length(playerCells) > 0) {
      boxscores[ip, seq_along(playerCells)] <- playerCells
    }
  }
  # Rewrite "Name, POS" as "Name(POS)".
  boxscores[, 1] <- paste0(sub(", ", "(", boxscores[, 1]), ")")
  # Label every row with "<team>_Start" or "<team>_Bench"; the number of
  # labels follows the number of player lines found in each section.
  teamLabel <- c(
    rep(paste0(teamName[1], "_Start"), length(indexOneStart)),
    rep(paste0(teamName[1], "_Bench"), length(indexOneBench)),
    rep(paste0(teamName[2], "_Start"), length(indexTwoStart)),
    rep(paste0(teamName[2], "_Bench"), length(indexTwoBench))
  )
  boxscores <- cbind(teamLabel, boxscores)
  colnames(boxscores) <- c("Team", "Name", boxFieldName)
  # Team totals: the <td> cells on the lines right after the 3rd and 6th
  # lines that mention "PTS"; cells 1-14 and 21-34 are used for the two teams.
  totalBoxscore <- array(NA, dim = c(2, ncol(boxscores)))
  pagetree <- htmlTreeParse(onehtml[grep("PTS", onehtml)[c(3, 6)] + 1],
                            useInternalNodes = TRUE, encoding = "UTF-8")
  TOTAL <- xpathSApply(pagetree, "//td", xmlValue)
  totalBoxscore[1, ] <- c("", "TOTAL", TOTAL[1:14])
  totalBoxscore[2, ] <- c("", "TOTAL", TOTAL[21:34])
  # Insert each team's TOTAL row directly below that team's last bench row.
  boxscores <- insertRow(
    boxscores,
    max(which(boxscores[, 1] == paste0(teamName[1], "_Bench"))) + 1,
    totalBoxscore[1, ]
  )
  boxscores <- insertRow(
    boxscores,
    max(which(boxscores[, 1] == paste0(teamName[2], "_Bench"))) + 1,
    totalBoxscore[2, ]
  )
  # Efficiency is only computed for rows whose MIN cell is a number; every
  # other row gets an empty EFF cell.
  playerEFFVector <- rep("", nrow(boxscores))
  indexPlayerEFF <- which(
    !is.na(suppressWarnings(as.numeric(boxscores[, "MIN"])))
  )
  # Row 1 of each result holds the made shots, row 2 the attempted shots.
  fieldGoals <- vapply(boxscores[indexPlayerEFF, "FGM-A"], boxShootNumber,
                       numeric(2), USE.NAMES = FALSE)
  freeThrows <- vapply(boxscores[indexPlayerEFF, "FTM-A"], boxShootNumber,
                       numeric(2), USE.NAMES = FALSE)
  playerFGM <- fieldGoals[1, ]
  playerFGA <- fieldGoals[2, ]
  playerFTM <- freeThrows[1, ]
  playerFTA <- freeThrows[2, ]
  # EFF = PTS + REB + AST + STL + BLK - missed FG - missed FT - TO
  playerEFFVector[indexPlayerEFF] <- as.character(
    as.numeric(boxscores[indexPlayerEFF, "PTS"]) +
      as.numeric(boxscores[indexPlayerEFF, "REB"]) +
      as.numeric(boxscores[indexPlayerEFF, "AST"]) +
      as.numeric(boxscores[indexPlayerEFF, "STL"]) +
      as.numeric(boxscores[indexPlayerEFF, "BLK"]) -
      (playerFGA - playerFGM) - (playerFTA - playerFTM) -
      as.numeric(boxscores[indexPlayerEFF, "TO"])
  )
  boxscores <- cbind(boxscores, EFF = playerEFFVector)
  # The header is written as the first data row because the sheet is saved
  # without column names.
  boxscores <- rbind(colnames(boxscores), boxscores)
  write.xlsx(boxscores, outFile, append = sheetWritten,
             row.names = FALSE, col.names = FALSE,
             sheetName = paste(teamName, collapse = '_VS_'))
  sheetWritten <- TRUE
}
############################################################################ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is great! I'd love to use the results in one data frame to examine the game results. I'll be adapting your code to do that. Want a copy?