Skip to content

Instantly share code, notes, and snippets.

@dfeng
Created October 19, 2012 07:11
Show Gist options
  • Save dfeng/3916670 to your computer and use it in GitHub Desktop.
Save dfeng/3916670 to your computer and use it in GitHub Desktop.
Diving Draft
# event (M3mSB, W3mSB, M10mPF, W10mPF)
# round (prelim, semi, final)
# diver (diver names)
# dcountry (hopefully the 3-character country code like USA)
# difficulty (this is a number like 3.2)
# score (scores range from 0 to 10)
# judge (the judge names, not numbers)
# jcountry (same comment applies as above)
# Load the raw results booklet (one element of `x` per line) and cut it down
# to the part that holds the score tables; the trimmed lines end up in `y`.
# NOTE(review): the working directory is a hard-coded, machine-specific path,
# so setwd() fails on any machine that does not have it. `dir` receives the
# previous working directory (the return value of setwd) and is not restored
# in this section.
dir <- setwd("/Volumes/HDD/Documents/Work/Current/625/Diving")
# x <- scan("2007_coupe_canada_cup_results_booklets.html", what="", sep="\n", quiet=TRUE, allowEscapes= TRUE)
x <- scan("2009_coupe_canada_cup_resultss.html", what="", sep="\n", quiet=TRUE, allowEscapes= TRUE)
# removing unicode: any character that cannot be converted to ASCII is dropped
x <- iconv(x, "latin1", "ASCII", sub="")
# brute: 5265
# hack: search for multiple dots on the line.
# t <- grep("\\.([^\\.]+\\.){4}", x)[1]
# alternate: slightly more robust
# Count the literal dots on every line. gregexpr() reports "no match" as a
# single -1, so only positive match positions are counted, and vapply() keeps
# the result a plain integer vector whatever the input looks like (two chained
# sapply() calls mis-count when the first one simplifies to a vector/matrix).
num.dots <- vapply(gregexpr("\\.", x), function(m) sum(m > 0L), integer(1))
# first line carrying more than 6 dots; used as the marker for where the
# score tables begin
t <- which(num.dots>6)[1]
if (is.na(t)) {
  stop("no line with more than 6 dots found; cannot locate the score tables",
       call. = FALSE)
}
# now we want the first line of the first page (starts with a <A name=)
# that has our stats
pages <- grep("name=",x)
pages.before <- pages[pages < t]
if (length(pages.before) == 0) {
  stop("no 'name=' page anchor found before the first score line",
       call. = FALSE)
}
# the closest page anchor above the first score line
split <- max(pages.before)
# keep everything from that anchor to the end of the file
y <- x[split:length(x)]
# Strip the HTML tags that carry no information. The pattern refuses any tag
# containing "/", so closing tags survive on purpose: the "</a>" that opens
# each page title is what the next step keys on.
y <- gsub("<[^>/]*>", "", y)
# Q: how to get each event (it's not split by page, which is a hassle)
# A: take the title line of every page and keep only the first occurrence of
# each distinct title. unique() would hand back the titles themselves, but we
# need their positions in y, hence duplicated() on the titles applied to the
# index vector.
# The ".*-.*" part filters out headings such as "</a>Document Outline</h1>".
title.pattern <- "^</a>.*-.*"
pageind <- grep(title.pattern, y)
paget <- y[pageind]
eventind <- pageind[!duplicated(paget)]
# parse each event title to extract event + round; first remove every
# remaining tag (closing ones included)
races <- gsub("<[^>]*>", "", y[eventind])
# event code = first character of the title followed by the board type
is.platform <- grepl("Platform", races)
board <- ifelse(is.platform, "10mPF", "3mSB")
event <- paste0(substring(races, 1, 1), board)
# round code: "Semi" is tested before "Finals", so a title containing both
# is classified as "S"; anything matching neither is "P"
is.semi <- grepl("Semi", races)
is.final <- grepl("Finals", races)
round <- ifelse(is.semi, "S", ifelse(is.final, "F", "P"))
# final matrix: one row per (dive, judge) pair, character columns in the order
# event, round, diver, dcountry, difficulty, score, judge, jcountry.
# It stays NULL when no event qualifies.
result <- c()
# Each event's rows are built as one matrix and stored here; everything is
# bound together once after the loop. Growing `result` with rbind() for every
# single row copies the whole matrix each time and was the slow part.
chunks <- vector("list", length(eventind))
# split by event: an event runs from its title line up to the line just before
# the next event's title (the last event runs to the end of y)
lasteventind <- c(eventind[-1]-1,length(y))
# seq_along() so that an empty eventind means zero iterations
for (i in seq_along(eventind)) {
  print(i) # for debugging
  page <- y[eventind[i]:lasteventind[i]]
  # skip Synch, and other random stuff (found in 2007); both operands are
  # length-one here, so the scalar && is the right operator
  if (!grepl("Synch", races[i]) && grepl("Open", races[i])) {
    # judges: from every line containing "Judge", the name is the text between
    # ": " and " -", the country is the text after the last "- "
    judges <- grep("Judge", page, value=TRUE)
    jcountries <- sub("^.*- (.*)$","\\1", judges)
    jnames <- sub("^.*: (.*) -.*$","\\1", judges)
    judgenum <- length(judges)
    # divers: lines containing "XX, YY" (two capitals, comma, space, two
    # capitals), excluding any line with a colon; a leading "12. " style
    # prefix is stripped
    dnames <- grep("[A-Z]{2}, [A-Z]{2}", page, value=TRUE)
    dnames <- grep(":", dnames, value=TRUE, invert=TRUE)
    dnames <- sub("^[0-9]+\\. ","", dnames)
    divernum <- length(dnames)
    # diver countries: lines made up of capital letters only
    dcountries <- grep("^[A-Z]+$", page, value=TRUE)
    stopifnot(divernum == length(dcountries))
    # so what we need now is difficulty and score (the hardest part,
    # since it's not perfect)
    # difficulty: lines that are just a number like 3.2, optionally preceded
    # by a digits-plus-letter code, which is removed
    difficulty <- grep("^([0-9]+[A-Z] )?[0-9]\\.[0-9]$", page, value=TRUE)
    difficulty <- gsub("^[0-9]+[A-Z] ", "", difficulty)
    # to get the scores, we first filter by multiple periods
    # (named scorelines rather than sub, which would shadow base::sub)
    scorelines <- grep("\\.([^\\.]+\\.){4}", page, value=TRUE)
    scores <- regmatches(scorelines, gregexpr("[1]?[0-9]\\.[05]", scorelines))
    scorenum <- length(scores)
    stopifnot(judgenum == length(scores[[1]]))
    # even better test
    # stopifnot(sum(duplicated(sapply(scores, length)))+1 == scorenum)
    # but we cheat, since most of the time, the problem case is at the last position
    # treats the case when we have "10.010.010.010.010.010.010.0 102.00":
    # only the first judgenum tokens of every score line are used below
    # number of dives
    stopifnot(scorenum %% divernum == 0)
    divesperdiver <- scorenum / divernum
    # di[j] is the diver that score line j belongs to. Score lines
    # 1..divesperdiver go to diver 1, the next divesperdiver to diver 2, and
    # so on. The previous formula, (j %/% divesperdiver) %% divernum + 1, was
    # off by one: it gave each diver's last dive to the next diver and wrapped
    # the very last dive round to diver 1.
    # NOTE(review): assumes score lines are grouped diver by diver in the same
    # order as dnames - confirm against the booklet layout.
    di <- (seq_len(scorenum) - 1) %/% divesperdiver + 1
    nrows <- scorenum * judgenum
    if (nrows > 0) {
      # first judgenum tokens of every score line, flattened so that the judge
      # index varies fastest (NA where a line has fewer tokens)
      scorecol <- unlist(lapply(scores, function(s) s[seq_len(judgenum)]))
      # columns are concatenated and reshaped column-wise; row order is
      # score line 1 x all judges, score line 2 x all judges, ...
      chunks[[i]] <- matrix(
        c(rep(event[i], nrows),
          rep(round[i], nrows),
          rep(dnames[di], each = judgenum),
          rep(dcountries[di], each = judgenum),
          rep(difficulty[seq_len(scorenum)], each = judgenum),
          scorecol,
          rep(jnames, times = scorenum),
          rep(jcountries, times = scorenum)),
        nrow = nrows, ncol = 8
      )
    }
  }
}
# bind all events at once; skipped events are NULL entries, which rbind()
# ignores, and an all-NULL list gives NULL
result <- do.call(rbind, chunks)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment