Created
October 19, 2012 07:11
-
-
Save dfeng/3916670 to your computer and use it in GitHub Desktop.
Diving Draft
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape per-judge diving scores out of an HTML results booklet.
#
# The `result` matrix filled in by the event loop further down holds one row
# per (dive, judge) pair with these fields, in this order:
#   event      (M3mSB, W3mSB, M10mPF, W10mPF)
#   round      (P = prelim, S = semi, F = final)
#   diver      (diver names)
#   dcountry   (hopefully the 3-character country code like USA)
#   difficulty (this is a number like 3.2)
#   score      (scores range from 0 to 10)
#   judge      (the judge names, not numbers)
#   jcountry   (same comment applies as above)

# NOTE(review): machine-specific path. setwd() returns the *previous* working
# directory, so `dir` holds the old directory, not the Diving one.
dir <- setwd("/Volumes/HDD/Documents/Work/Current/625/Diving")

# x <- scan("2007_coupe_canada_cup_results_booklets.html", what="", sep="\n", quiet=TRUE, allowEscapes= TRUE)
x <- scan("2009_coupe_canada_cup_resultss.html", what = "", sep = "\n",
          quiet = TRUE, allowEscapes = TRUE)

# removing unicode: anything that cannot be converted to ASCII is deleted
x <- iconv(x, "latin1", "ASCII", sub = "")

# Find the first line that carries judge scores.
# brute: 5265
# hack: search for multiple dots on the line.
# t <- grep("\\.([^\\.]+\\.){4}", x)[1]
# alternate: slightly more robust - count the dots on every line.
# gregexpr() reports -1 when there is no match, so only positive positions
# are counted (a line without dots counts as 0).
num.dots <- vapply(gregexpr("\\.", x), function(m) sum(m > 0L), integer(1))
t <- which(num.dots > 6)[1]
if (is.na(t)) {
  stop("no line with more than six dots found, cannot locate the scores",
       call. = FALSE)
}

# now we want the first line of the first page (starts with a <A name=)
# that has our stats: the last "name=" anchor before the first score line
pages <- grep("name=", x)
if (!any(pages < t)) {
  stop("no 'name=' page anchor found before the first score line",
       call. = FALSE)
}
split <- max(pages[pages < t])
y <- x[split:length(x)]

# remove html as they're not helpful
# </a> is helpful though: the pattern only strips tags that contain no "/",
# so closing tags survive and mark the page titles below
y <- gsub("<[^>/]*>", "", y)

# Q: how to get each event (it's not split by page, which is a hassle)
# A: we look at the titles of each page, and then remove duplicates.
# unique() would return the values, but we want the indexes, so we keep the
# positions of the first occurrence of every title instead.
# the .*-.* is to remove headings like "</a>Document Outline</h1>"
pageind <- grep("^</a>.*-.*", y)
paget <- y[pageind]
eventind <- pageind[!duplicated(paget)]

# parsing race string to extract event+round
races <- gsub("<[^>]*>", "", y[eventind])
# event code = first character of the title + board type
event <- paste0(substring(races, 1, 1),
                ifelse(grepl("Platform", races), "10mPF", "3mSB"))
# anything that is neither a semi nor a final is treated as a prelim
round <- ifelse(grepl("Semi", races), "S",
                ifelse(grepl("Finals", races), "F", "P"))

# final matrix
result <- c()

# split by event: each event runs up to the line before the next event title,
# the last one runs to the end of the document
lasteventind <- c(eventind[-1] - 1, length(y))
# Extract judges, divers, difficulties and scores for every event and append
# one row per (dive, judge) pair to `result`.
# Rows are built per event in one vectorised step and bound together once at
# the end; growing `result` with rbind() row by row was the slow part.
event.rows <- vector("list", length(eventind))
for (i in seq_along(eventind)) {
  print(i) # for debugging
  page <- y[eventind[i]:lasteventind[i]]

  # skip Synch, and other random stuff (found in 2007)
  if (grepl("Synch", races[i]) || !grepl("Open", races[i])) {
    next
  }

  # judges: the name is the text between ": " and " -", the country is the
  # text after the last "- " on every line mentioning "Judge"
  judges <- grep("Judge", page, value = TRUE)
  jcountries <- sub("^.*- (.*)$", "\\1", judges)
  jnames <- sub("^.*: (.*) -.*$", "\\1", judges)
  judgenum <- length(judges)

  # divers: lines with two capitals, a comma, a space and two capitals
  # (presumably "SURNAME, GIVEN" - TODO confirm), excluding lines with a colon;
  # a leading "12. " style rank is stripped
  dnames <- grep("[A-Z]{2}, [A-Z]{2}", page, value = TRUE)
  dnames <- grep(":", dnames, value = TRUE, invert = TRUE)
  dnames <- sub("^[0-9]+\\. ", "", dnames)
  divernum <- length(dnames)
  # diver countries: lines made of capital letters only
  dcountries <- grep("^[A-Z]+$", page, value = TRUE)
  stopifnot(divernum == length(dcountries))

  # so what we need now is difficulty and score (the hardest part,
  # since it's not perfect)
  # difficulty lines are "3.2" optionally prefixed by a dive code like "107B "
  difficulty <- grep("^([0-9]+[A-Z] )?[0-9]\\.[0-9]$", page, value = TRUE)
  difficulty <- gsub("^[0-9]+[A-Z] ", "", difficulty)

  # to get the scores, we first filter by multiple periods
  score.lines <- grep("\\.([^\\.]+\\.){4}", page, value = TRUE)
  scores <- regmatches(score.lines, gregexpr("[1]?[0-9]\\.[05]", score.lines))
  scorenum <- length(scores)
  stopifnot(judgenum > 0, divernum > 0, scorenum > 0)
  stopifnot(judgenum == length(scores[[1]]))
  # even better test
  # stopifnot(sum(duplicated(sapply(scores, length)))+1 == scorenum)
  # but we cheat, since most of the time, the problem case is at the last position
  # treats the case when we have "10.010.010.010.010.010.010.0 102.00":
  # the trailing total yields an extra match, so only the first `judgenum`
  # matches of every line are used below

  # number of dives
  stopifnot(scorenum %% divernum == 0)
  divesperdiver <- scorenum / divernum

  if (length(difficulty) != scorenum) {
    warning("event ", i, ": ", length(difficulty), " difficulty values for ",
            scorenum, " dives, the difficulty column may be misaligned",
            call. = FALSE)
  }

  # one row per (dive, judge): dives vary slowest, judges fastest
  dive.idx <- rep(seq_len(scorenum), each = judgenum)
  judge.idx <- rep(seq_len(judgenum), times = scorenum)
  # dives 1..divesperdiver belong to diver 1, the next batch to diver 2, ...
  # (assumes the score lines are grouped diver by diver)
  diver.idx <- (dive.idx - 1) %/% divesperdiver + 1
  # lines with fewer than `judgenum` matches are padded with NA
  judged <- unlist(lapply(scores, function(s) s[seq_len(judgenum)]),
                   use.names = FALSE)

  event.rows[[i]] <- cbind(
    event = event[i],
    round = round[i],
    diver = dnames[diver.idx],
    dcountry = dcountries[diver.idx],
    difficulty = difficulty[dive.idx],
    score = judged,
    judge = jnames[judge.idx],
    jcountry = jcountries[judge.idx]
  )
}
# skipped events left NULL entries, which rbind() ignores
result <- rbind(result, do.call(rbind, event.rows))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment