Skip to content

Instantly share code, notes, and snippets.

@donarus
Created June 27, 2015 15:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save donarus/3b07c76d996c108dcb61 to your computer and use it in GitHub Desktop.
Save donarus/3b07c76d996c108dcb61 to your computer and use it in GitHub Desktop.
NBA example
library(RCurl)
library(XML)
library(plyr)
library(doParallel)
######## SETTINGS BEGIN
defaultUserAgent <- paste0(
"AcademicAgent. Collecting historic performance and biographical data ",
"of individual players for student academic research purposes only, not for ",
"commercial use or sharing. Webpage containing description of academic research ",
"project: https://www.dropbox.com/sh/ywgcce11euj9iec/AADA7J6yoP5QZD8Tmkv6sBvja?dl=0. ",
"Contact info: youjoo@indiana.edu"
)
injuriesSearchPageTemplate <- "http://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&ILChkBx=yes&InjuriesChkBx=yes&submit=Search&start=%d"
######## SETTING END
######## FUNCTIONS BEGIN
trim <- function (x) {
gsub("^\\s+|\\s+$", "", x)
}
downloadPageTree <- function(url, userAgent = defaultUserAgent) {
page <- getURL(url, httpheader = c('User-Agent' = userAgent))
pageLines <- readLines(tc <- textConnection(page)); close(tc)
htmlTreeParse(pageLines, error=function(...){}, useInternalNodes = TRUE)
}
######## FUNCTIONS END
pagetree <- downloadPageTree(sprintf(injuriesSearchPageTemplate, 0))
pagesCnt <- as.numeric(xpathSApply(pagetree, '/html/body/div[3]/table[2]/tr/td[3]/p/a[last()]/text()', xmlValue)[[1]])
print("## Downloading player injuries pages")
pages <- llply(0:(pagesCnt - 1), function(i) {
if((i+1) %% 100 == 0) {
tts <- sample(20,1)
print(sprintf("Sleeping for %d seconds after 100 downloaded pages", tts))
Sys.sleep(tts)
}
tts <- sample(5,1)
print(sprintf('downloading page %d/%d and sleeping for %d seconds after download is finished', (i+1), pagesCnt, tts))
pagetree <- downloadPageTree(sprintf(injuriesSearchPageTemplate, i))
Sys.sleep(tts) # just random sleep after each request
pagetree
}, .progress = "time")
players <- unique(unlist(llply(pages, function(page) {
acquired <- xpathSApply(page, '/html/body/div[3]/table[1]/tr/td[3]', xmlValue)
relinquished <- xpathSApply(page, '/html/body/div[3]/table[1]/tr/td[4]', xmlValue)
res <- list()
if (length(acquired) > 1) {
res <- c(acquired[2:length(acquired)], relinquished[2:length(acquired)])
}
res <- unlist(strsplit(res, "/"))
res <- unlist(llply(res, trim))
res <- res[res != ""]
res
})))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment