Skip to content

Instantly share code, notes, and snippets.

@Ray901
Last active September 30, 2015 00:17
Show Gist options
  • Save Ray901/12ff59fb161cde99ac28 to your computer and use it in GitHub Desktop.
Save Ray901/12ff59fb161cde99ac28 to your computer and use it in GitHub Desktop.
Web scraping ESPN NBA box scores with R
# Clear the workspace and remember when the run started.
rm(list = ls())
tic <- Sys.time()
############################################################################
# Game date to scrape, written as YYYYMMDD. An empty string makes the code
# further down fall back to yesterday's date.
setDate <- "20150430"
############################################################################
# Attach every package the script needs, in order, installing a package
# first when it cannot be loaded.
invisible(lapply(c("XML", "RCurl", "xlsx", "miscTools"), function(pkgName) {
  if (!require(pkgName, character.only = TRUE)) {
    install.packages(pkgName)
  }
  library(pkgName, character.only = TRUE)
}))
############################################################################
# Split a "made-attempted" shooting string such as "7-15" into two numbers.
#
# Args:
#   shootNumber: a single value (coerced to character) of the form "M-A".
#                Only the first element is used.
#
# Returns:
#   A numeric vector c(made, attempted). When the input does not consist of
#   exactly two parts separated by one "-" (for example "", NA or a text
#   cell), c(NA, NA) is returned instead of raising an error, so one
#   malformed cell cannot abort a whole scrape.
boxShootNumber <- function(shootNumber) {
  parts <- strsplit(as.character(shootNumber)[1], "-", fixed = TRUE)[[1]]
  if (length(parts) != 2L) {
    return(c(NA_real_, NA_real_))
  }
  # Non-numeric pieces become NA; the coercion warning adds nothing here.
  suppressWarnings(as.numeric(parts))
}
############################################################################
# Fall back to yesterday's date (YYYYMMDD) when no date was configured.
# Sys.Date() is read once so the value cannot straddle midnight.
if (nchar(setDate) == 0) {
  setDate <- format(Sys.Date() - 1, "%Y%m%d")
}

# Download the scoreboard page for the chosen date, one element per line.
html <- readLines(
  paste0("http://scores.espn.go.com/nba/scoreboard?date=", setDate),
  warn = FALSE, encoding = "UTF-8"
)
# Earlier extraction approach, kept for reference:
# html<-readLines("http://scores.espn.go.com/nba/scoreboard",warn=F,encoding = "UTF-8")
# A <- as.character(unlist(strsplit(html[grep("scoreboard-page",html)],",")))
# gamehtml<-substr(A[grep("http://espn.go.com/nba/boxscore",A)],9,56)

# The line(s) that assign window.espn.scoreboardData carry the game links.
scriptdata <- html[grep("window.espn.scoreboardData", html)]

# Pull every distinct "gameId=<digits>" token. Matching the digits directly
# avoids assuming a fixed ID width and never turns a "no match" result into
# a bogus substring.
gameID <- unique(unlist(
  regmatches(scriptdata, gregexpr("gameId=[0-9]+", scriptdata))
))
if (length(gameID) == 0) {
  stop("No game IDs found on the ESPN scoreboard for ", setDate,
       call. = FALSE)
}

# One box score URL per game.
gamehtml <- paste0("http://espn.go.com/nba/boxscore?", gameID)
# Scrape each box score page and write one worksheet per game.
#
# workbookStarted records whether a sheet has already been written in this
# run. The first sheet that is actually written replaces the output file and
# later sheets are appended; keying this on the loop index instead would
# append to a missing or stale file whenever the first game is skipped.
workbookStarted <- FALSE
for (ig in seq_along(gamehtml)) {
  onehtml <- readLines(gamehtml[ig], warn = FALSE, encoding = "UTF-8")

  # Skip pages that have no "MIN" column header, i.e. no box score table.
  headerLines <- grep("width=5%>MIN", onehtml)
  if (length(headerLines) == 0) {
    next
  }

  # Column names: <th> texts in the 14 lines starting at the first header.
  pagetree <- htmlTreeParse(onehtml[headerLines[1]:(headerLines[1] + 13)],
                            useInternalNodes = TRUE, encoding = "UTF-8")
  boxFieldName <- xpathSApply(pagetree, "//th", xmlValue)
  #boxFieldName[13]<-"increase or decrease"

  # Team names: <th> texts on the lines carrying the small team logo class.
  pagetree <- htmlTreeParse(
    onehtml[grep("logo-small logo-nba-small nba-small", onehtml)],
    useInternalNodes = TRUE, encoding = "UTF-8"
  )
  teamName <- xpathSApply(pagetree, "//th", xmlValue)

  # Lines containing a player link. The first match is dropped, as before;
  # NOTE(review): presumably it is a link outside the box score tables.
  indexPlayer <- grep("http://espn.go.com/nba/player/_/id/", onehtml)[-1]
  if (length(indexPlayer) == 0) {
    next
  }

  # Partition the player lines by the STARTERS / BENCH header lines:
  # team 1 starters, team 1 bench, team 2 starters, team 2 bench.
  indexStart <- grep("<th style=\"text-align:left\">STARTERS</th>", onehtml)
  indexBench <- grep("<th style=\"text-align:left\">BENCH</th>", onehtml)
  indexOneStart <- indexPlayer[which(indexPlayer < indexBench[1])]
  indexOneBench <- indexPlayer[which(indexPlayer > indexBench[1] &
                                       indexPlayer < indexStart[2])]
  indexTwoStart <- indexPlayer[which(indexPlayer > indexStart[2] &
                                       indexPlayer < indexBench[2])]
  indexTwoBench <- indexPlayer[which(indexPlayer > indexBench[2])]

  # One row per player line, filled from the left with that line's <td>
  # texts; cells a line does not provide stay NA.
  boxscores <- array(NA, dim = c(length(indexPlayer), length(boxFieldName) + 1))
  for (ip in seq_along(indexPlayer)) {
    pagetree <- htmlTreeParse(onehtml[indexPlayer[ip]],
                              useInternalNodes = TRUE, encoding = "UTF-8")
    cells <- xpathSApply(pagetree, "//td", xmlValue)
    if (length(cells) > 0) {
      boxscores[ip, seq_along(cells)] <- cells
    }
  }

  # Rewrite the first column from "A, B" to "A(B)".
  boxscores[, 1] <- paste0(sub(", ", "(", boxscores[, 1]), ")")

  # Prepend a Team column. Group sizes come from the partition above, so
  # the label vector always has one entry per player row.
  boxscores <- cbind(c(
    rep(paste0(teamName[1], "_Start"), length(indexOneStart)),
    rep(paste0(teamName[1], "_Bench"), length(indexOneBench)),
    rep(paste0(teamName[2], "_Start"), length(indexTwoStart)),
    rep(paste0(teamName[2], "_Bench"), length(indexTwoBench))
  ), boxscores)
  colnames(boxscores) <- c("Team", "Name", boxFieldName)

  # Team totals: <td> texts from the lines following the 3rd and 6th lines
  # that contain "PTS". NOTE(review): these positions and the 1:14 / 21:34
  # slices are tied to the page layout - confirm if the layout changes.
  totalBoxscore <- array(NA, dim = c(2, dim(boxscores)[2]))
  pagetree <- htmlTreeParse(onehtml[grep("PTS", onehtml)[c(3, 6)] + 1],
                            useInternalNodes = TRUE, encoding = "UTF-8")
  TOTAL <- xpathSApply(pagetree, "//td", xmlValue)
  totalBoxscore[1, ] <- c("", "TOTAL", TOTAL[1:14])
  totalBoxscore[2, ] <- c("", "TOTAL", TOTAL[21:34])

  # Insert each TOTAL row right after that team's last bench row. The second
  # lookup runs on the already extended matrix, so the shift is accounted for.
  boxscores <- insertRow(
    boxscores,
    max(grep(paste0(teamName[1], "_Bench"), boxscores[, 1], fixed = TRUE)) + 1,
    totalBoxscore[1, ]
  )
  boxscores <- insertRow(
    boxscores,
    max(grep(paste0(teamName[2], "_Bench"), boxscores[, 1], fixed = TRUE)) + 1,
    totalBoxscore[2, ]
  )

  # EFF is computed only for rows whose MIN cell is numeric; every other
  # row keeps an empty string.
  playerEFFVector <- rep("", nrow(boxscores))
  indexPlayerEFF <- which(!is.na(suppressWarnings(as.numeric(boxscores[, "MIN"]))))
  playerFGM <- rep(0, length(indexPlayerEFF))
  playerFGA <- rep(0, length(indexPlayerEFF))
  playerFTM <- rep(0, length(indexPlayerEFF))
  playerFTA <- rep(0, length(indexPlayerEFF))
  for (ip in seq_along(indexPlayerEFF)) {
    FGNumber <- boxShootNumber(as.character(boxscores[indexPlayerEFF[ip], "FGM-A"]))
    playerFGM[ip] <- FGNumber[1]
    playerFGA[ip] <- FGNumber[2]
    FTNumber <- boxShootNumber(as.character(boxscores[indexPlayerEFF[ip], "FTM-A"]))
    playerFTM[ip] <- FTNumber[1]
    playerFTA[ip] <- FTNumber[2]
  }

  # EFF = PTS + REB + AST + STL + BLK - missed FG - missed FT - TO
  statValue <- function(stat) as.numeric(boxscores[indexPlayerEFF, stat])
  playerEFFVector[indexPlayerEFF] <- as.character(
    statValue("PTS") + statValue("REB") + statValue("AST") +
      statValue("STL") + statValue("BLK") -
      (playerFGA - playerFGM) - (playerFTA - playerFTM) -
      statValue("TO")
  )
  boxscores <- cbind(boxscores, EFF = playerEFFVector)

  # The header is written as the first data row because col.names is off.
  boxscores <- rbind(colnames(boxscores), boxscores)
  write.xlsx(boxscores, paste0("d:/NBA_boxscores_", setDate, ".xls"),
             append = workbookStarted, row.names = FALSE, col.names = FALSE,
             sheetName = paste(teamName, collapse = "_VS_"))
  workbookStarted <- TRUE
}
############################################################################
@Mpellet771
Copy link

This is great! I'd love to use the results in one data frame to examine the game results. I'll be adapting your code to do that. Want a copy?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment