# dfeng/diving.R

## diving.R
# event      (M3mSB, W3mSB, M10mPF, W10mPF)
# round      (prelim, semi, final)
# diver      (diver names)
# dcountry   (hopefully the 3-character country code like USA)
# difficulty (this is a number like 3.2)
# score      (scores range from 0 to 10)
# judge      (the judge names, not numbers)
# jcountry   (same comment applies as above)

# Work inside the folder that holds the saved results booklet. setwd()
# returns the previous working directory; it is kept in `dir` so it can be
# restored with setwd(dir).
dir <- setwd("/Volumes/HDD/Documents/Work/Current/625/Diving")
# x <- scan("2007_coupe_canada_cup_results_booklets.html", what="", sep="\n", quiet=TRUE, allowEscapes= TRUE)
x <- scan("2009_coupe_canada_cup_resultss.html", what = "", sep = "\n",
          quiet = TRUE, allowEscapes = TRUE)

# removing unicode: every character without an ASCII equivalent is dropped
x <- iconv(x, "latin1", "ASCII", sub = "")

# brute: 5265
# hack: search for multiple dots on the line.
# t <- grep("\\.([^\\.]+\\.){4}", x)[1]

# alternate: slightly more robust
# gregexpr() returns one match vector per line, so its length is the number
# of dots on that line (a line without dots yields the single value -1 and
# therefore counts as 1, which is harmless for the threshold below).
# vapply() always returns exactly one integer per line; sapply() would
# collapse to a matrix whenever every line had the same number of dots.
num.dots <- vapply(gregexpr("\\.", x), length, integer(1))
# first line carrying more than 6 dots
t <- which(num.dots > 6)[1]
if (is.na(t)) {
    stop("no line with more than 6 dots found; cannot locate the score pages",
         call. = FALSE)
}

# now we want the first line of the first page (starts with a <A name=)
# that has our stats: the last "name=" line before line t
pages <- grep("name=", x)
stopifnot(any(pages < t))
split <- max(pages[pages < t])
y <- x[split:length(x)]

# remove html as they're not helpful
# </a> is helpful though (the pattern leaves any tag containing "/" alone)
y <- gsub("<[^>/]*>", "", y)

# Q: how to get each event (it's not split by page, which is a hassle)
# A: we look at the titles of each page, and then keep only the first
# occurrence of each title. unique() would return the titles themselves,
# but we need their line indexes, hence duplicated().
# the .*-.* is to remove headings like "</a>Document Outline</h1>"
pageind <- grep("^</a>.*-.*", y)
paget <- y[pageind]
eventind <- pageind[!duplicated(paget)]

# parsing race string to extract event+round
races <- gsub("<[^>]*>", "", y[eventind])
# event code: first character of the title, then "10mPF" when the title
# mentions "Platform" and "3mSB" otherwise
event <- paste(substring(races, 1, 1),
               ifelse(grepl("Platform", races), "10mPF", "3mSB"),
               sep = "")
# round code: "S" for titles containing "Semi", "F" for "Finals", else "P"
round <- ifelse(grepl("Semi", races), "S",
                ifelse(grepl("Finals", races), "F", "P"))

# final matrix: one row per (dive, judge) pair, columns in this order:
# event, round, diver, dcountry, difficulty, score, judge, jcountry
# (all stored as character; rows and columns are unnamed)
result <- c()

# split by event: event i covers y[eventind[i]:lasteventind[i]]
lasteventind <- c(eventind[-1] - 1, length(y))
# seq_along() runs zero iterations when no event was found (1:0 would not)
for (i in seq_along(eventind)) {
    print(i) # for debugging
    page <- y[eventind[i]:lasteventind[i]]
    # skip Synch, and other random stuff (found in 2007):
    # only titles that contain "Open" and do not contain "Synch" are parsed
    if (grepl("Synch", races[i]) || !grepl("Open", races[i])) next

    # judges: on each "Judge" line the country is the text after the last
    # "- " and the name is the text between ": " and " -"
    judges <- grep("Judge", page, value = TRUE)
    jcountries <- sub("^.*- (.*)$", "\\1", judges)
    jnames <- sub("^.*: (.*) -.*$", "\\1", judges)
    judgenum <- length(judges)

    # divers: lines with two capitals, ", ", two capitals, excluding any
    # line that contains ":"; a leading "<number>. " prefix is stripped
    dnames <- grep("[A-Z]{2}, [A-Z]{2}", page, value = TRUE)
    dnames <- grep(":", dnames, value = TRUE, invert = TRUE)
    dnames <- sub("^[0-9]+\\. ", "", dnames)
    divernum <- length(dnames)
    # diver countries: lines made only of capital letters
    dcountries <- grep("^[A-Z]+$", page, value = TRUE)
    stopifnot(divernum == length(dcountries))

    # so what we need now is difficulty and score (the hardest part,
    # since it's not perfect)

    # difficulty: lines holding just a d.d number, optionally preceded by
    # a "<digits><capital> " prefix, which is removed
    difficulty <- grep("^([0-9]+[A-Z] )?[0-9]\\.[0-9]$", page, value = TRUE)
    difficulty <- gsub("^[0-9]+[A-Z] ", "", difficulty)

    # to get the scores, we first filter by multiple periods, then pull
    # every token shaped like d.0 / d.5 / 1d.0 / 1d.5 out of each line
    scorelines <- grep("\\.([^\\.]+\\.){4}", page, value = TRUE)
    scores <- regmatches(scorelines, gregexpr("[1]?[0-9]\\.[05]", scorelines))
    scorenum <- length(scores)
    stopifnot(scorenum > 0)
    stopifnot(judgenum == length(scores[[1]]))

    # even better test
    # stopifnot(sum(duplicated(sapply(scores, length)))+1 == scorenum)
    # but we cheat, since most of the time, the problem case is at the last position
    # treats the case when we have "10.010.010.010.010.010.010.0 102.00"
    # (only the first judgenum tokens of each line are used below)

    # number of dives
    stopifnot(scorenum %% divernum == 0)
    divesperdiver <- scorenum / divernum

    # Build all rows of this event at once instead of rbind()-ing one row
    # per (dive, judge) pair, which was the slow part.
    dive <- rep(seq_len(scorenum), each = judgenum)    # score line of each row
    judge <- rep(seq_len(judgenum), times = scorenum)  # judge of each row
    # di is the diver number: dives 1..divesperdiver belong to diver 1, the
    # next divesperdiver to diver 2, and so on. The -1 keeps a diver's last
    # dive from spilling over to the next diver.
    # assumes score lines list each diver's dives consecutively, in the
    # same order as dnames - TODO confirm against the booklet layout
    di <- (dive - 1) %/% divesperdiver + 1
    # first judgenum tokens of every score line, dive by dive (NA if short)
    judged <- unlist(lapply(scores, function(s) s[seq_len(judgenum)]))
    rownum <- length(dive)
    # matrix() fills column by column, so each vector below is one column
    block <- matrix(c(rep(event[i], rownum), rep(round[i], rownum),
                      dnames[di], dcountries[di], difficulty[dive],
                      judged, jnames[judge], jcountries[judge]),
                    ncol = 8)
    result <- rbind(result, block)
}
	# event (M3mSB, W3mSB, M10mPF, W10mPF)
	# round (prelim, semi, final)
	# diver (diver names)
	# dcountry (hopefully the 3-character country code like USA)
	# difficulty (this is a number like 3.2)
	# score (scores range from 0 to 10)
	# judge (the judge names, not numbers)
	# jcountry (same comment applies as above)

	# Work inside the folder that holds the saved results booklet. setwd()
	# returns the previous working directory; it is kept in `dir` so it can be
	# restored with setwd(dir).
	dir <- setwd("/Volumes/HDD/Documents/Work/Current/625/Diving")
	# x <- scan("2007_coupe_canada_cup_results_booklets.html", what="", sep="\n", quiet=TRUE, allowEscapes= TRUE)
	x <- scan("2009_coupe_canada_cup_resultss.html", what = "", sep = "\n",
	          quiet = TRUE, allowEscapes = TRUE)

	# removing unicode: every character without an ASCII equivalent is dropped
	x <- iconv(x, "latin1", "ASCII", sub = "")

	# brute: 5265
	# hack: search for multiple dots on the line.
	# t <- grep("\\.([^\\.]+\\.){4}", x)[1]

	# alternate: slightly more robust
	# gregexpr() returns one match vector per line, so its length is the number
	# of dots on that line (a line without dots yields the single value -1 and
	# therefore counts as 1, which is harmless for the threshold below).
	# vapply() always returns exactly one integer per line; sapply() would
	# collapse to a matrix whenever every line had the same number of dots.
	num.dots <- vapply(gregexpr("\\.", x), length, integer(1))
	# first line carrying more than 6 dots
	t <- which(num.dots > 6)[1]
	if (is.na(t)) {
	    stop("no line with more than 6 dots found; cannot locate the score pages",
	         call. = FALSE)
	}

	# now we want the first line of the first page (starts with a <A name=)
	# that has our stats: the last "name=" line before line t
	pages <- grep("name=", x)
	stopifnot(any(pages < t))
	split <- max(pages[pages < t])
	y <- x[split:length(x)]

	# remove html as they're not helpful
	# </a> is helpful though (the pattern leaves any tag containing "/" alone)
	y <- gsub("<[^>/]*>", "", y)

	# Q: how to get each event (it's not split by page, which is a hassle)
	# A: we look at the titles of each page, and then keep only the first
	# occurrence of each title. unique() would return the titles themselves,
	# but we need their line indexes, hence duplicated().
	# the .*-.* is to remove headings like "</a>Document Outline</h1>":
	# a title must contain a "-" somewhere after the leading "</a>"
	pageind <- grep("^</a>.*-.*", y)
	paget <- y[pageind]
	eventind <- pageind[!duplicated(paget)]

	# parsing race string to extract event+round
	races <- gsub("<[^>]*>", "", y[eventind])
	# event code: first character of the title, then "10mPF" when the title
	# mentions "Platform" and "3mSB" otherwise
	event <- paste(substring(races, 1, 1),
	               ifelse(grepl("Platform", races), "10mPF", "3mSB"),
	               sep = "")
	# round code: "S" for titles containing "Semi", "F" for "Finals", else "P"
	round <- ifelse(grepl("Semi", races), "S",
	                ifelse(grepl("Finals", races), "F", "P"))

	# final matrix: one row per (dive, judge) pair, columns in this order:
	# event, round, diver, dcountry, difficulty, score, judge, jcountry
	# (all stored as character; rows and columns are unnamed)
	result <- c()

	# split by event: event i covers y[eventind[i]:lasteventind[i]]
	lasteventind <- c(eventind[-1] - 1, length(y))
	# seq_along() runs zero iterations when no event was found (1:0 would not)
	for (i in seq_along(eventind)) {
	    print(i) # for debugging
	    page <- y[eventind[i]:lasteventind[i]]
	    # skip Synch, and other random stuff (found in 2007):
	    # only titles that contain "Open" and do not contain "Synch" are parsed
	    if (grepl("Synch", races[i]) || !grepl("Open", races[i])) next

	    # judges: on each "Judge" line the country is the text after the last
	    # "- " and the name is the text between ": " and " -"
	    judges <- grep("Judge", page, value = TRUE)
	    jcountries <- sub("^.*- (.*)$", "\\1", judges)
	    jnames <- sub("^.*: (.*) -.*$", "\\1", judges)
	    judgenum <- length(judges)

	    # divers: lines with two capitals, ", ", two capitals, excluding any
	    # line that contains ":"; a leading "<number>. " prefix is stripped
	    dnames <- grep("[A-Z]{2}, [A-Z]{2}", page, value = TRUE)
	    dnames <- grep(":", dnames, value = TRUE, invert = TRUE)
	    dnames <- sub("^[0-9]+\\. ", "", dnames)
	    divernum <- length(dnames)
	    # diver countries: lines made only of capital letters
	    dcountries <- grep("^[A-Z]+$", page, value = TRUE)
	    stopifnot(divernum == length(dcountries))

	    # so what we need now is difficulty and score (the hardest part,
	    # since it's not perfect)

	    # difficulty: lines holding just a d.d number, optionally preceded by
	    # a "<digits><capital> " prefix, which is removed
	    difficulty <- grep("^([0-9]+[A-Z] )?[0-9]\\.[0-9]$", page, value = TRUE)
	    difficulty <- gsub("^[0-9]+[A-Z] ", "", difficulty)

	    # to get the scores, we first filter by multiple periods, then pull
	    # every token shaped like d.0 / d.5 / 1d.0 / 1d.5 out of each line
	    scorelines <- grep("\\.([^\\.]+\\.){4}", page, value = TRUE)
	    scores <- regmatches(scorelines, gregexpr("[1]?[0-9]\\.[05]", scorelines))
	    scorenum <- length(scores)
	    stopifnot(scorenum > 0)
	    stopifnot(judgenum == length(scores[[1]]))

	    # even better test
	    # stopifnot(sum(duplicated(sapply(scores, length)))+1 == scorenum)
	    # but we cheat, since most of the time, the problem case is at the last position
	    # treats the case when we have "10.010.010.010.010.010.010.0 102.00"
	    # (only the first judgenum tokens of each line are used below)

	    # number of dives
	    stopifnot(scorenum %% divernum == 0)
	    divesperdiver <- scorenum / divernum

	    # Build all rows of this event at once instead of rbind()-ing one row
	    # per (dive, judge) pair, which was the slow part.
	    dive <- rep(seq_len(scorenum), each = judgenum)    # score line of each row
	    judge <- rep(seq_len(judgenum), times = scorenum)  # judge of each row
	    # di is the diver number: dives 1..divesperdiver belong to diver 1, the
	    # next divesperdiver to diver 2, and so on. The -1 keeps a diver's last
	    # dive from spilling over to the next diver.
	    # assumes score lines list each diver's dives consecutively, in the
	    # same order as dnames - TODO confirm against the booklet layout
	    di <- (dive - 1) %/% divesperdiver + 1
	    # first judgenum tokens of every score line, dive by dive (NA if short)
	    judged <- unlist(lapply(scores, function(s) s[seq_len(judgenum)]))
	    rownum <- length(dive)
	    # matrix() fills column by column, so each vector below is one column
	    block <- matrix(c(rep(event[i], rownum), rep(round[i], rownum),
	                      dnames[di], dcountries[di], difficulty[dive],
	                      judged, jnames[judge], jcountries[judge]),
	                    ncol = 8)
	    result <- rbind(result, block)
	}