Created
February 16, 2012 04:03
-
-
Save ashleybot/1841870 to your computer and use it in GitHub Desktop.
Use Nokogiri and open-uri to parse Golden Gate Trail Run results from HTML to XML.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Use Nokogiri and open-uri to parse Golden Gate Trail Run results from HTML to XML.
# 2012.02.13
#
require 'rubygems' | |
require 'nokogiri' | |
require 'open-uri' | |
# Race results for the Golden Gate Trail Run in San Francisco, California on Saturday, February 11, 2012.
# URI.open is called explicitly: open-uri no longer patches Kernel#open to accept URLs as of Ruby 3.0.
race_results_page = Nokogiri::HTML(URI.open("http://www.coastaltrailruns.com/gg_results_12.htm"))

# Builds an XML document whose <root> holds one <runner> element per half marathon result row.
# Each <runner> carries place, age, gender, age_range and place_by_group as attributes and
# name, city, bib, time and pace as child elements.
builder = Nokogiri::XML::Builder.new do |xml|
  xml.root do
    in_half_marathon = false

    # The results for all the races that day are displayed in a single table.
    race_results_page.xpath("//tr").each do |row|
      # Half marathon results begin at the merged cell with "Half Marathon" in h1 tags and end at '30 Km'.
      in_half_marathon = true unless row.xpath("td/h1[font='Half Marathon']").empty?
      in_half_marathon = false unless row.xpath("td/h1[font='30 Km']").empty?
      next unless in_half_marathon

      # Rows containing a merged (colspan) cell are section headings, not runner rows.
      next unless row.xpath("td[@colspan]").empty?

      # W3C specs state the first element is at 1.
      # NodeSet#text returns the decoded text; NodeSet#to_s would return entity-escaped markup,
      # which the builder would then escape a second time.
      place = row.xpath("td[1]/p/font/text()").text # 000
      next if place.empty?

      name = row.xpath("td[2]/font/text()").text   # First Last
      city = row.xpath("td[3]/font/text()").text   # City State (no punctuation)
      bib  = row.xpath("td[4]/p/font/text()").text # 000
      age  = row.xpath("td[5]/p/font/text()").text # 00

      # Group cell format: "00 M 00-00" (place gender age-range).
      # When it has fewer than three parts, fall back to the overall place, gender "N"
      # and an empty age range.
      group = row.xpath("td[6]/p/font/text()").text.strip.split(" ")
      if group.length > 2
        place_by_group, gender, age_range = group
      else
        place_by_group = place
        gender = "N"
        age_range = ""
      end

      time = row.xpath("td[7]/p/font/text()").text # 0:00:00
      pace = row.xpath("td[8]/p/font/text()").text # 00:00/M

      xml.runner(:place => place, :age => age, :gender => gender, :age_range => age_range, :place_by_group => place_by_group) do
        xml.name name
        xml.city city
        xml.bib bib
        xml.time time
        xml.pace pace
      end
    end
  end
end
# Write the XML to GoldenGate2012Results.xml.
# Mode "w" truncates an existing file, so no separate delete step is needed, and the
# block form of File.open closes the handle even if the write raises.
File.open("GoldenGate2012Results.xml", "w") do |f|
  f.puts builder.to_xml
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment