Skip to content

Instantly share code, notes, and snippets.

@nathanesau
Created October 8, 2017 22:42
Show Gist options
  • Save nathanesau/0be6fb0447fa0317b953a95a13c63d25 to your computer and use it in GitHub Desktop.
Save nathanesau/0be6fb0447fa0317b953a95a13c63d25 to your computer and use it in GitHub Desktop.
ATP parser
# Title: Tennis Draw Parser
# Author: Nathan Esau
library(RCurl)
library(XML)
# NOTE(review): a negative `warn` silences every warning for the rest of the
# session, including any raised while downloading or parsing pages.
options(warn = -1) # suppress warnings

# Get all singles draws listed in the 2017 results archive
url <- "https://www.atpworldtour.com/en/scores/results-archive?year=2017"

# Download and parse the archive page; `doc` has to exist before the XPath
# query below can run.
doc <- htmlParse(getURL(url), asText = TRUE)

# Every hyperlink target on the page, as a plain unnamed character vector
links <- as.character(xpathSApply(doc, "//a/@href"))

# Keep the draw pages, then drop the doubles draws
links <- links[grepl("draws", links)]
links <- links[!grepl("doubles", links)]

# The hrefs are prefixed with the site host to form full URLs. vapply()
# (rather than a bare paste0()) keeps the result empty when no links matched.
drawURLs <- vapply(
  links,
  function(path) paste0("https://www.atpworldtour.com", path),
  character(1),
  USE.NAMES = FALSE
)
# Split one first-round table cell into the two player names it contains.
#
# cell:     a single string taken from the first column of the draw table.
# patterns: regular expressions; every match is replaced by one space before
#           the cell is split into words.
# Returns a character vector of length 2 (name1, name2). When the cell holds
# no wide gap between two names, the whole text is returned as name1 and
# name2 is NA. Returns character(0) when nothing but blanks is left.
.splitMatchNames <- function(cell, patterns) {
  for (pattern in patterns) {
    cell <- gsub(pattern, " ", cell)
  }
  tokens <- unlist(strsplit(cell, " "))
  filled <- which(nchar(tokens) > 0)
  if (length(filled) == 0L) {
    return(character(0))
  }
  words <- tokens[filled]
  # Runs of blanks turn into empty tokens, so a jump of more than 5 positions
  # between consecutive non-empty tokens marks the boundary between players.
  gaps <- which(diff(filled) > 5)
  if (length(gaps) == 0L) {
    return(c(paste(words, collapse = " "), NA_character_))
  }
  # Only the first wide gap is used as the boundary.
  splitAt <- gaps[1]
  c(
    paste(words[seq_len(splitAt)], collapse = " "),
    paste(words[(splitAt + 1L):length(words)], collapse = " ")
  )
}

# Extract the first-round player names from a tournament draw page.
#
# url: the tournament url
# Returns an unnamed character vector with two entries per first-round cell
# (name1, name2, name1, name2, ...), in table order. Cells of 5 characters or
# fewer, NA cells and blank cells are skipped.
# Stops with an error when the page has fewer than three HTML tables.
getSeeds <- function(url) {
  tables <- readHTMLTable(getURL(url))
  if (length(tables) < 3L) {
    stop("expected at least 3 HTML tables at ", url, call. = FALSE)
  }
  draw <- tables[[3]] # the third table on the page is used as the draw
  # First column = first round (gives the initial bracket and seeds)
  round1 <- as.character(draw[, 1])
  round1 <- round1[!is.na(round1) & nchar(round1) > 5]
  # Line breaks, digits, parentheses and the Q / WC / PR markers are blanked
  # out so that only name words remain.
  patterns <- c('\\r', '\\t', '\\n', '[0-9]+', '\\(Q\\)', '\\(', '\\)', 'WC', 'PR')
  pairs <- lapply(round1, .splitMatchNames, patterns = patterns)
  # as.character() turns the NULL from an empty list into character(0)
  as.character(unlist(pairs, use.names = FALSE))
}
# Parse the first-round names of every draw page, one list element per URL.
# lapply() walks drawURLs directly, so an empty set of URLs yields an empty
# list rather than looping over the indices c(1, 0) that 1:length() produces.
out <- lapply(drawURLs, getSeeds)

# Show the first tournament's names as a quick check (skipped when there
# are no results to show)
if (length(out) > 0L) {
  out[[1]]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment