Created
December 23, 2009 09:54
-
-
Save LouisStAmour/262436 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' # A script to pull and parse just routedetails | |
require 'jdbc/sqlite3' # Written in jRuby with jdbc-sqlite3 and celerity installed | |
require 'celerity' # Trust me, you'd prefer http://toronto.ca/open to this | |
# I'm using this to test the open dataset given as of Oct 27 | |
# Assuming the following structure for database ... | |
# # DROP TABLE ttcca_routedetails; DROP TABLE ttcca_scheduleitem; DROP TABLE ttcca_routestop; DROP TABLE ttcca_stopdetails; | |
# | |
# sql = <<SQL | |
# create table ttcca_routedetails ( | |
# routeid TEXT, -- apparently not unique? | |
# branchcode TEXT, -- "_" if nil | |
# version INT, | |
# description TEXT, | |
# routename TEXT | |
# ); | |
# create table ttcca_scheduleitem ( | |
# routeid TEXT, -- apparently not unique? | |
# branchcode TEXT, | |
# version INT, | |
# stopid INT, -- apparently not unique? | |
# day INT, -- should index | |
# time INT -- time converted to int | |
# ); | |
# create table ttcca_routestop ( | |
# routeid TEXT, -- apparently not unique? | |
# branchcode TEXT, | |
# version INT, | |
# stopid INT, -- apparently not unique? | |
# routestopid INT, | |
# routestopdirection INT | |
# ); | |
# create table ttcca_stopdetails ( | |
# stopid INT, -- apparently not unique? | |
# station INT, | |
# name TEXT, | |
# accessible INT, | |
# subwayconnection INT, | |
# nearestnode INT -- guessed; not TTC provided, this links to the centreline data from toronto.ca/open | |
# -- nearestnode is -1 if data is unavailable. | |
# ); | |
# SQL | |
database_path = "jdbc:sqlite:db.sqlite" | |
Java::org.sqlite.JDBC #init the driver | |
conn = java.sql.DriverManager.getConnection(database_path) | |
puts "Loading routedetails: routeid, branchcode, version, description, routename" | |
prep = conn.prepareStatement("insert into ttcca_routedetails values (?, ?, 200912, ?, ?);") | |
browser = Celerity::Browser.new({ :javascript_enabled => true, :css => false }) | |
browser.goto("http://www3.ttc.ca/Routes/Buses.jsp") | |
bus_links = browser.elements_by_xpath "//div[@id='ttc-main-content']/div[1]/div/ul/li/a" | |
# returns an array of <Celerity::Element>s, all links with contents like: | |
# 353 Steeles East <span class="access">Accessible</span> <span class="access">Bicycle Rack</span> | |
bus_links.each do |b| | |
b.click | |
h1 = browser.element_by_xpath "//h1[@id='ttc-route-name']" | |
puts h1.inner_text | |
parsed = h1.inner_text.split(" ") # => ["5", "Avenue", "Rd", "Northbound"] | |
routeid = parsed[0]+parsed[-1][0].chr # eg. 5N | |
routename = parsed[1,parsed.length-2].join(" ").upcase # => "AVENUE RD" | |
routes = browser.elements_by_xpath "//div[@id='ttc-dir-1']/div/ul/li" | |
routes.each do |r| | |
branchcode = r.object.children.first.getAttribute("class")[-1].chr # => "_" | |
description = r.object.children.first.getNextSibling.asText # => "5 AVENUE RD To EGLINTON STN" | |
prep.setString(1, routeid) | |
prep.setString(2, branchcode) | |
prep.setString(3, description) | |
prep.setString(4, routename) | |
prep.addBatch | |
end | |
reverse_direction_link = browser.element_by_xpath "//ul[@id='ttc-route-tabs']/li[2]/a" | |
puts reverse_direction_link.inner_text | |
routeid = parsed[0]+reverse_direction_link.inner_text[0].chr | |
reverse_direction_link.click | |
routes = browser.elements_by_xpath "//div[@id='ttc-dir-1']/div/ul/li" | |
routes.each do |r| | |
branchcode = r.object.children.first.getAttribute("class")[-1].chr # => "_" | |
description = r.object.children.first.getNextSibling.asText # => "5 AVENUE RD To EGLINTON STN" | |
prep.setString(1, routeid) | |
prep.setString(2, branchcode) | |
prep.setString(3, description) | |
prep.setString(4, routename) | |
prep.addBatch | |
end | |
end | |
browser.goto("http://www3.ttc.ca/Routes/Streetcars.jsp") | |
bus_links = browser.elements_by_xpath "//div[@id='ttc-main-content']/div[1]/div/ul/li/a" | |
# returns an array of <Celerity::Element>s, all links with contents like: | |
# 353 Steeles East <span class="access">Accessible</span> <span class="access">Bicycle Rack</span> | |
bus_links.each do |b| | |
b.click | |
h1 = browser.element_by_xpath "//h1[@id='ttc-route-name']" | |
puts h1.inner_text | |
parsed = h1.inner_text.split(" ") # => ["5", "Avenue", "Rd", "Northbound"] | |
routeid = parsed[0]+parsed[-1][0].chr # eg. 5N | |
routename = parsed[1,parsed.length-2].join(" ").upcase # => "AVENUE RD" | |
routes = browser.elements_by_xpath "//div[@id='ttc-dir-1']/div/ul/li" | |
routes.each do |r| | |
branchcode = r.object.children.first.getAttribute("class")[-1].chr # => "_" | |
description = r.object.children.first.getNextSibling.asText # => "5 AVENUE RD To EGLINTON STN" | |
prep.setString(1, routeid) | |
prep.setString(2, branchcode) | |
prep.setString(3, description) | |
prep.setString(4, routename) | |
prep.addBatch | |
end | |
reverse_direction_link = browser.element_by_xpath "//ul[@id='ttc-route-tabs']/li[2]/a" | |
puts reverse_direction_link.inner_text | |
routeid = parsed[0]+reverse_direction_link.inner_text[0].chr | |
reverse_direction_link.click | |
routes = browser.elements_by_xpath "//div[@id='ttc-dir-1']/div/ul/li" | |
routes.each do |r| | |
branchcode = r.object.children.first.getAttribute("class")[-1].chr # => "_" | |
description = r.object.children.first.getNextSibling.asText # => "5 AVENUE RD To EGLINTON STN" | |
prep.setString(1, routeid) | |
prep.setString(2, branchcode) | |
prep.setString(3, description) | |
prep.setString(4, routename) | |
prep.addBatch | |
end | |
end | |
puts "Committing ..." | |
conn.setAutoCommit false | |
prep.executeBatch | |
conn.setAutoCommit true | |
conn.close |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment