Created
December 19, 2010 15:09
-
-
Save josip/747400 to your computer and use it in GitHub Desktop.
ak_importer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'eventmachine' | |
require 'thread' | |
require 'uri' | |
require 'open-uri' | |
require 'nokogiri' | |
require 'ripple' | |
require 'json' | |
# Root of the autobusni-kolodvor.com site; page URLs below are built on it.
BASE_URL = 'http://www.autobusni-kolodvor.com'
# A stanica.aspx URL on that site (d=191). Nothing in the code shown here reads it.
STATIONS_URL = "#{BASE_URL}/stanica.aspx?d=191"

# Riak client plus the two buckets the importer writes to.
DB = Riak::Client.new
STATIONS_BUCKET = DB.bucket("stations")
LINES_BUCKET = DB.bucket("lines")

# @stations collects station page URLs (filled by get_stations);
# @parsed_lines remembers line ids that were already imported (see get_lines).
@stations, @parsed_lines = [], []
# Matches the two bytes (0xC3, 0xC2) that fix_encoding strips.
# The /n flag keeps the pattern byte-oriented: without it Ruby 1.9+ refuses the
# literal in a UTF-8 source file ("invalid multibyte escape").
# fix_encoding itself works on byte values and does not use this constant; it
# is kept so that any other reference to it keeps resolving.
UTF_CRAP = /(\303|\302)/n

# Returns a copy of +xs+ with a fixed byte-level repair applied:
# every 0xC3 and 0xC2 byte is dropped, then 0x85 becomes 0xC5 and 0x84 becomes
# 0xC4. For example the bytes C3 85 C2 A1 collapse to C5 A1, which is the
# UTF-8 encoding of "š".
#
# The work is done on raw byte values (unpack/pack) so the result does not
# depend on how the interpreter tags string encodings.
#
# @param xs [String] text to repair; it is not modified
# @return [String] the repaired copy, tagged with the encoding of +xs+ on
#   Rubies whose strings carry an encoding
def fix_encoding(xs)
  kept = xs.unpack('C*').reject { |byte| byte == 0xC3 || byte == 0xC2 }
  remapped = kept.map do |byte|
    case byte
    when 0x85 then 0xC5
    when 0x84 then 0xC4
    else byte
    end
  end
  repaired = remapped.pack('C*')
  # pack yields a binary-tagged string on 1.9+; restore the caller's encoding.
  repaired.force_encoding(xs.encoding) if repaired.respond_to?(:force_encoding)
  repaired
end
# Builds a random string of lowercase ASCII letters.
#
# @param length [Integer] number of characters wanted; zero or a negative
#   value yields an empty string
# @return [String] +length+ characters, each drawn from 'a'..'z' with rand
def random_string(length)
  # Build the alphabet once instead of once per generated character.
  alphabet = ('a'..'z').to_a
  (0...length).map { alphabet[rand(alphabet.size)] }.join
end
# Matches every character outside RFC 2396's "unreserved" set
# (ASCII letters, digits and - _ . ! ~ * ' ( ) ).
# Written out literally instead of being derived from URI::PATTERN, which
# current versions of the uri library treat as obsolete.
URI_REGEXP = /[^\-_.!~*'()a-zA-Z\d]/

# Percent-encodes every character of +uri+ that is not RFC 2396 "unreserved".
# Multi-byte characters are encoded byte by byte (e.g. "š" => "%C5%A1").
#
# Implemented by hand because URI.escape was removed in Ruby 3.0.
#
# @param uri [String] raw text, typically a Riak key
# @return [String] the text with unsafe characters replaced by %XX escapes
def encode_uri(uri)
  uri.gsub(URI_REGEXP) do |unsafe|
    unsafe.unpack('C*').map { |byte| format('%%%02X', byte) }.join
  end
end
# Creates a Riak::Link whose key has been passed through encode_uri first.
#
# @param bucket bucket argument handed to Riak::Link.new unchanged
# @param key [String] object key; percent-encoded before use
# @param tag [String] link tag
# @return [Riak::Link]
def encoded_riak_link(bucket, key, tag)
  safe_key = encode_uri(key)
  Riak::Link.new(bucket, safe_key, tag)
end
# Imports a single bus line into Riak.
#
# Creating an instance downloads the line's timetable page, stores every stop
# in STATIONS_BUCKET, chains consecutive stops with "next_<line id>" links and
# finally stores the line itself in LINES_BUCKET under the key "bus_<line id>".
class BusLine
  # @param exid [String] line id, used as the value of the "lin=" query parameter
  def initialize(exid)
    @exid = exid
    @line = Riak::RObject.new(LINES_BUCKET, "bus_#{@exid}")
    @line.content_type = "application/json"
    @line.data = {
      :source => "AK"
    }
    get_page()
    parse_page()
  end

  # Downloads and parses the timetable page for this line (memoised in @page).
  def get_page
    url = "#{BASE_URL}/stanice.aspx?lin=#{@exid}"
    puts " < #{url}"
    # NOTE(review): Nokogiri::HTML takes (input, url, encoding), so "UTF-8" is
    # passed in the url position here and no encoding is forced. Left as is
    # because parse_page runs every title through fix_encoding; confirm before
    # changing either side.
    @page ||= Nokogiri::HTML(open(url), "UTF-8")
  end

  # Walks the ".row2inverse" rows of the page, storing each stop and linking it
  # to the previous one, then stores the line with its timetable and stop keys.
  def parse_page
    rows = @page.css(".row2inverse")
    timetable = []
    stops = []
    prev = nil
    max = rows.count
    last_index = max - 1
    puts " O Line no.#{@exid} | #{max} stops"
    rows.each_with_index do |row, n|
      title, arrival, departure = row.children.map { |cell| cell.content.strip }
      title = fix_encoding(title)
      timetable << [arrival, departure]
      puts "-> #{title}@#{arrival} ->"
      bus_stop = get_stop(title)
      stops << bus_stop.key
      unless prev.nil?
        prev.links << encoded_riak_link(bus_stop.bucket.name, bus_stop.key, "next_" + @exid)
        prev.store
      end
      bus_stop.store
      prev = bus_stop
      # Indices run from 0 to max - 1, so the final row is last_index (the old
      # comparison against max never matched and no "last" link was written).
      # The bucket is passed by name, the same way the "next_" links do it.
      if n == 0
        @line.links << encoded_riak_link(bus_stop.bucket.name, bus_stop.key, "first")
      end
      if n == last_index
        @line.links << encoded_riak_link(bus_stop.bucket.name, bus_stop.key, "last")
      end
    end
    @line.data[:timetable] = timetable
    @line.data[:stops] = stops
    @line.store
  end

  # Fetches the station object stored under "bus_<title>", or prepares a new
  # 'secondary' one when nothing (or no data) is stored under that key yet.
  #
  # @param title [String] stop name as shown on the timetable page
  # @return the station object; the caller is responsible for storing it
  def get_stop(title)
    stop = STATIONS_BUCKET.get_or_new("bus_#{title}")
    if stop.data.nil? || stop.data.empty?
      stop.content_type = "application/json"
      exid = "#{@exid}_#{random_string(5)}"
      stop.data = {
        # 'secondary' bus stops don't have their own IDs
        :exid => exid,
        :country => "HR",
        :secondary => true,
        :source => "AK"
      }
      puts " # Created station #{title} | #{exid}"
    end
    stop
  end
end
# Reads the local file "rh_stanice.html", stores one primary station object per
# link found under ".placeleft3 .placecontent", and fills @stations with the
# URL of each station's page.
#
# @return [nil] the method ends with a puts
def get_stations
  # Block form so the file handle is closed once Nokogiri has read it.
  page = File.open("rh_stanice.html") { |file| Nokogiri::HTML(file) }
  puts "Stations.html loaded"
  @stations = page.css(".placeleft3 .placecontent a").map do |a|
    # href is expected to look like "...?s=<station id>&d=<d value>"
    station_id, d_value = a[:href].split("?s=")[1].split("&d=")
    title = fix_encoding(a.content.strip)
    station = Riak::RObject.new(STATIONS_BUCKET, "bus_#{title}")
    station.content_type = "application/json"
    station.data = {
      :exid => station_id,
      :secondary => false,
      :country => "HR",
      :source => "AKZ"
    }
    station.store
    "#{BASE_URL}/stanica.aspx?s=#{station_id}&d=#{d_value}"
  end
  puts "Stored details about #{@stations.count} station into /riak/stations"
end
# Bookkeeping list; nothing in the code shown here reads it.
@lines = []
# Store the primary stations and collect their page URLs into @stations.
get_stations
# Imports every bus line linked from a station page, once per line id.
#
# Each ".cell8 a" anchor is expected to carry an href containing
# "lin=<line id>". Ids already present in @parsed_lines are skipped; new ones
# are recorded there and handed to BusLine.new.
#
# @param page parsed station page (anything responding to #css)
# @return the collection of anchors that was iterated
def get_lines(page)
  @parsed_lines ||= []
  page.css(".cell8 a").each do |a|
    href = a[:href]
    # Anchors without an href, or without a "lin=" value, are not line links.
    next if href.nil?
    id = href.split("lin=")[1]
    next if id.nil? || @parsed_lines.include?(id)
    @parsed_lines << id
    BusLine.new(id)
  end
end
# Visit every station page collected in @stations (iterator concurrency 5) and
# import the lines it links to. The reactor is stopped from the iterator's
# completion callback; a bare EM.run with no EM.stop would never return.
EM.run do
  EM::Iterator.new(@stations, 5).each(
    proc do |url, iter|
      # Block form closes the downloaded IO as soon as it has been parsed.
      page = open(url) { |io| Nokogiri::HTML(io) }
      get_lines(page)
      # Signal completion only after the page has been fully processed.
      iter.next
    end,
    proc { EM.stop }
  )
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
$KCODE='UTF8' | |
require 'rubygems' | |
require 'open-uri' | |
require 'eventmachine' | |
require 'nokogiri' | |
require 'uri' | |
require 'ripple' | |
# Riak client and buckets, exposed as constants: the rest of this script refers
# to STATIONS_BUCKET by constant name, including from inside a class, where
# instance variables of the top-level object are not visible.
DB = Riak::Client.new
LINES_BUCKET = DB.bucket('lines')
STATIONS_BUCKET = DB.bucket('stations')

# The original instance-variable names are kept as aliases of the constants.
@DB = DB
@LINES_BUCKET = LINES_BUCKET
@STATIONS_BUCKET = STATIONS_BUCKET

# Top-level bookkeeping lists.
@TRAINS = []
@PARSED_TRAINS = []
# Characters that get percent-encoded: everything outside RFC 2396's reserved
# and unreserved sets (the default "unsafe" set of the old URI.escape).
HZ_UNSAFE_CHARS = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]/

# Percent-encoded UTF-8 sequences and the single-byte escapes that replace
# them, applied in this order.
# NOTE(review): the lowercase š and both Ž/ž rows use windows-1250 byte values
# (9A, 8E, 9E), but Š is 8A in windows-1250; A9 is its ISO-8859-2 value.
# Left unchanged -- confirm what the server expects.
HZ_ESCAPE_REWRITES = [
  ["%C5%A0", "%A9"], # Š
  ["%C5%A1", "%9A"], # š
  ["%C4%90", "%D0"], # Đ
  ["%C4%91", "%F0"], # đ
  ["%C4%8C", "%C8"], # Č
  ["%C4%8D", "%E8"], # č
  ["%C4%86", "%C6"], # Ć
  ["%C4%87", "%E6"], # ć
  ["%C5%BD", "%8E"], # Ž
  ["%C5%BE", "%9E"], # ž
  ["%20", "+"]       # <space>
].freeze

# Encodes +str+ for use as a query value: unsafe characters are
# percent-encoded byte by byte, then the Croatian letters listed in
# HZ_ESCAPE_REWRITES are collapsed to single-byte escapes and spaces become "+".
#
# The percent-encoding step is done by hand because URI.escape was removed in
# Ruby 3.0.
#
# @param str [String] raw text, e.g. a station name
# @return [String] the encoded text
def escape_uri(str)
  escaped = str.gsub(HZ_UNSAFE_CHARS) do |chunk|
    chunk.unpack('C*').map { |byte| format('%%%02X', byte) }.join
  end
  HZ_ESCAPE_REWRITES.inject(escaped) { |text, (from, to)| text.gsub(from, to) }
end
# Produces +length+ random lowercase ASCII letters.
#
# @param length [Integer] how many characters to generate; values below one
#   give an empty string
# @return [String] characters picked from 'a'..'z' with rand
def random_string(length)
  # The letter table is built once per call rather than once per character.
  letters = ('a'..'z').to_a
  (0...length).map { letters[rand(letters.size)] }.join
end
# Matches any character that is not in RFC 2396's "unreserved" set
# (ASCII letters, digits and - _ . ! ~ * ' ( ) ).
# Given as a literal because URI::PATTERN is obsolete in current versions of
# the uri library.
URI_REGEXP = /[^\-_.!~*'()a-zA-Z\d]/

# Replaces every character of +uri+ outside the unreserved set with its %XX
# escape, one escape per byte (so "š" becomes "%C5%A1").
#
# Hand-rolled because URI.escape no longer exists as of Ruby 3.0.
#
# @param uri [String] raw text, typically a Riak key
# @return [String] percent-encoded text
def encode_uri(uri)
  uri.gsub(URI_REGEXP) do |unsafe|
    unsafe.unpack('C*').map { |byte| format('%%%02X', byte) }.join
  end
end
# Builds a Riak::Link after running the key through encode_uri.
#
# @param bucket bucket argument, forwarded to Riak::Link.new untouched
# @param key [String] object key; percent-encoded first
# @param tag [String] link tag
# @return [Riak::Link]
def encoded_riak_link(bucket, key, tag)
  escaped_key = encode_uri(key)
  Riak::Link.new(bucket, escaped_key, tag)
end
# Imports a single train line into Riak.
#
# Creating an instance downloads the timetable page at +url+, stores each stop
# in STATIONS_BUCKET, chains consecutive stops with "next_<train id>" links and
# stores the line in LINES_BUCKET under the key "train_<train id>".
class TrainLine
  # URLs that have been handed to TrainLine.new so far. Kept on the class
  # because a freshly created instance has no instance variables to append to.
  #
  # @return [Array<String>]
  def self.parsed_urls
    @parsed_urls ||= []
  end

  # @param url [String] address of the train's timetable page
  # @param notes [String] free-text remarks stored with the line
  def initialize(url, notes)
    self.class.parsed_urls << url
    @exid = extract_exid(url)
    @line = Riak::RObject.new(LINES_BUCKET, "train_#{@exid}")
    @line.content_type = "application/json"
    @line.data = {
      :exid => @exid,
      :notes => notes,
      :source => "HZ"
    }
    @page = Nokogiri::HTML(open(url), nil, "windows-1250")
    # get_stops finishes by storing @line, so no second store is needed here.
    get_stops()
  end

  # Walks the rows of the page's 'table:eq(2)', storing each stop and linking
  # it to the previous one, then stores the line with its timetable and stops.
  def get_stops
    rows = @page.css('table:eq(2) tr')
    timetable = []
    stops = []
    prev = nil
    max = rows.count
    last_index = max - 1
    puts " O Train no.#{@exid} | #{max} stops"
    rows.each_with_index do |row, n|
      title, arrival, departure = row.css("td").map { |cell| cell.content.strip }[0, 3]
      timetable << [arrival, departure]
      puts "-> #{title}@#{arrival} ->"
      train_stop = get_stop(title)
      stops << train_stop.key
      unless prev.nil?
        prev.links << encoded_riak_link(train_stop.bucket.name, train_stop.key, "next_#{@exid}")
        prev.store
      end
      train_stop.store
      prev = train_stop
      # Indices run from 0 to max - 1, so the final row is last_index.
      if n == 0
        @line.links << encoded_riak_link(train_stop.bucket.name, train_stop.key, "first")
      end
      if n == last_index
        @line.links << encoded_riak_link(train_stop.bucket.name, train_stop.key, "last")
      end
    end
    @line.data[:timetable] = timetable
    @line.data[:stops] = stops
    @line.store
  end

  # Fetches the station stored under "train_<title>" (the prefix get_terminals
  # uses for its station keys), or prepares a new 'secondary' one when nothing
  # (or no data) is stored under that key yet.
  #
  # @param title [String] stop name as shown on the timetable page
  # @return the station object; the caller is responsible for storing it
  def get_stop(title)
    stop = STATIONS_BUCKET.get_or_new("train_#{title}")
    if stop.data.nil? || stop.data.empty?
      stop.content_type = "application/json"
      exid = "#{@exid}_#{random_string(5)}"
      stop.data = {
        :exid => exid,
        :country => "HR",
        :source => "HZ",
        :secondary => true
      }
      puts " # Created station #{title} | #{exid}"
    end
    stop
  end

  private

  # Pulls the train id out of the URL's "VL=" query parameter.
  # NOTE(review): assumes the id is the VL value; when the URL has no VL
  # parameter this falls back to everything before the first "&".
  def extract_exid(url)
    match = url.match(/[?&]VL=([^&]*)/)
    match ? match[1] : url.split("&")[0]
  end
end
# Imports every train linked from a terminal's result page, once per URL.
#
# Each row of 'table:eq(2)' that contains a link is turned into a TrainLine;
# the stripped text of the row's last cell is passed along as the notes. URLs
# already listed in @PARSED_TRAINS are skipped and new ones are recorded there.
#
# @param page parsed result page (anything responding to #css)
# @return [nil]
def get_trains(page)
  # NOTE(review): the page is skipped when it contains NO red <font> element;
  # confirm that red text marks a page worth importing rather than an error.
  return if page.css("font[color='#FF0000']").empty?
  @PARSED_TRAINS ||= []
  page.css("table:eq(2) tr").each do |row|
    anchor = row.css("a")[0]
    # Rows without a link carry no train to import.
    next if anchor.nil?
    url = anchor[:href]
    next if url.nil? || @PARSED_TRAINS.include?(url)
    @PARSED_TRAINS << url
    TrainLine.new(url, row.css("td").last.content.strip)
  end
  nil
end
# Downloads the hzinfo start page, stores one primary station per entry of its
# "KO" select box and returns the query URL for each of them.
#
# @return [Array<String>] one URL per option, in page order
def get_terminals
  start_page = Nokogiri::HTML(open('http://vred.hznet.hr/hzinfo/?category=hzinfo&service=izvr3'), nil, "windows-1250")
  options = start_page.css("select[name=KO] option")
  options.map do |option|
    name = option.content.strip
    term = escape_uri(name)
    station = Riak::RObject.new(STATIONS_BUCKET, "train_" + name)
    station.content_type = "application/json"
    station.data = {
      :exid => term,
      :secondary => false,
      :country => "HR",
      :source => "HZ"
    }
    station.store
    # The encoded name doubles as the KO query value.
    "http://vred.hznet.hr/hzinfo/Default.asp?KO=#{term}&Category=hzinfo&Service=izvr3&LANG=HR&DL=1&OL=2&SCREEN=2"
  end
end
# Visit the result page of every terminal (iterator concurrency 5) and import
# the trains listed on it. The reactor is stopped from the iterator's
# completion callback; a bare EM.run with no EM.stop would never return.
EM.run do
  EM::Iterator.new(get_terminals(), 5).each(
    proc do |url, iter|
      puts url
      # Block form closes the downloaded IO as soon as it has been parsed.
      page = open(url) { |io| Nokogiri::HTML(io) }
      get_trains(page)
      # Signal completion only after the page has been fully processed.
      iter.next
    end,
    proc { EM.stop }
  )
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment