Skip to content

Instantly share code, notes, and snippets.

@josip
Created December 19, 2010 15:09
Show Gist options
  • Save josip/747400 to your computer and use it in GitHub Desktop.
Save josip/747400 to your computer and use it in GitHub Desktop.
ak_importer
#!/usr/bin/env ruby
require 'rubygems'
require 'eventmachine'
require 'thread'
require 'uri'
require 'open-uri'
require 'nokogiri'
require 'ripple'
require 'json'
# Root of the bus timetable site; every page URL used below is built from it.
BASE_URL = 'http://www.autobusni-kolodvor.com'
STATIONS_URL = "#{BASE_URL}/stanica.aspx?d=191"

# One Riak client shared by the whole script, plus the two buckets it writes to.
DB = Riak::Client.new
STATIONS_BUCKET, LINES_BUCKET = %w[stations lines].map { |bucket_name| DB.bucket(bucket_name) }

# @stations is filled by get_stations with one URL per station;
# @parsed_lines remembers the line ids get_lines has already handled.
@stations, @parsed_lines = [], []
# Bytes 0xC3 and 0xC2, which fix_encoding strips from scraped titles.
# The /n flag makes this a byte-oriented pattern; without it Ruby 1.9+ rejects
# the bare \303 / \302 escapes with "invalid multibyte escape".
UTF_CRAP = /(\303|\302)/n

# Repairs the byte-level damage seen in titles scraped from the bus site.
#
# Works on raw bytes: every 0xC3 / 0xC2 byte is dropped, then 0x85 becomes 0xC5
# and 0x84 becomes 0xC4. For example the four bytes C3 85 C2 A1 come back as
# C5 A1, the UTF-8 encoding of "š".
#
# NOTE(review): a legitimate 0x84/0x85 continuation byte or a genuine
# 0xC2/0xC3 lead byte is rewritten as well -- same as the original regex chain.
#
# @param xs [String] the damaged text; it is not modified
# @return [String] a new string carrying the same encoding tag as +xs+
def fix_encoding(xs)
  raw = xs.dup
  # Ruby 1.8 strings have no encoding tag, hence the respond_to? guards.
  raw.force_encoding('BINARY') if raw.respond_to?(:force_encoding)

  bytes = raw.gsub(UTF_CRAP, '').unpack('C*').map do |byte|
    case byte
    when 0x85 then 0xC5
    when 0x84 then 0xC4
    else byte
    end
  end

  fixed = bytes.pack('C*')
  fixed.force_encoding(xs.encoding) if fixed.respond_to?(:force_encoding)
  fixed
end
# Builds a string of +length+ lowercase ASCII letters, each picked with rand.
#
# @param length [Integer] number of letters wanted
# @return [String]
def random_string(length)
  letters = ('a'..'z').to_a
  (0...length).inject('') { |acc, _| acc << letters[rand(26)] }
end
# Matches every character outside the RFC 2396 "unreserved" set
# (letters, digits and - _ . ! ~ * ' ( )). Spelled out literally -- it is the
# value of URI::PATTERN::UNRESERVED -- so it does not depend on the legacy
# URI parser constants.
URI_REGEXP = Regexp.new("[^\\-_.!~*'()a-zA-Z\\d]")

# Percent-encodes every character of +uri+ that is not RFC 2396 "unreserved".
#
# Each byte of a matched character becomes an upper-case %XX triplet, which is
# what URI.escape(uri, URI_REGEXP) produced; URI.escape itself was removed in
# Ruby 3.0.
#
# @param uri [String] text to encode (here: a Riak key)
# @return [String] the encoded copy
def encode_uri(uri)
  uri.gsub(URI_REGEXP) do |unsafe|
    unsafe.unpack('C*').map { |byte| format('%%%02X', byte) }.join
  end
end
# Creates a Riak::Link whose key has been passed through encode_uri first.
#
# @param bucket bucket argument handed to Riak::Link.new unchanged
# @param key [String] raw object key
# @param tag [String] link tag
# @return [Riak::Link]
def encoded_riak_link(bucket, key, tag)
  escaped_key = encode_uri(key)
  Riak::Link.new(bucket, escaped_key, tag)
end
# Imports a single bus line into Riak.
#
# Constructing an instance does all the work: it downloads the line's page,
# walks its ".row2inverse" rows, stores one station object per row and finally
# stores the line object under the key "bus_<exid>".
class BusLine
  # @param exid [String] the site's id for the line (the "lin=" query value)
  def initialize(exid)
    @exid = exid
    @line = Riak::RObject.new(LINES_BUCKET, "bus_#{@exid}")
    @line.content_type = "application/json"
    @line.data = {
      :source => "AK"
    }
    get_page()
    parse_page()
  end

  # Downloads and parses the line's page (kept in @page after the first call).
  #
  # The URL is opened through URI#open in block form: the handle is closed as
  # soon as parsing is done, and it keeps working on Ruby 3 where Kernel#open
  # no longer fetches URLs.
  #
  # NOTE(review): the second positional argument of Nokogiri::HTML is the
  # document URL, not the encoding (that is the third), so "UTF-8" is not acting
  # as an encoding here. Left untouched because fix_encoding is applied to the
  # titles afterwards -- confirm before changing.
  def get_page
    url = "#{BASE_URL}/stanice.aspx?lin=#{@exid}"
    puts " < #{url}"
    @page ||= URI.parse(url).open { |io| Nokogiri::HTML(io, "UTF-8") }
  end

  # Turns the rows of the page into stored stops, a timetable and links.
  #
  # * every stop gets a "next_<exid>" link to the stop that follows it;
  # * the line gets a "first" link to the first stop and a "last" link to the
  #   final one (a one-stop line gets both, pointing at the same stop);
  # * the line's data receives :timetable ([arrival, departure] pairs) and
  #   :stops (station keys), in row order.
  def parse_page
    rows = @page.css(".row2inverse")
    timetable = []
    stops = []
    prev = nil
    max = rows.count
    puts " O Line no.#{@exid} | #{max} stops"
    rows.each_with_index do |row, n|
      title, arrival, departure = row.children.map { |cell| cell.content.strip }
      title = fix_encoding(title)
      timetable << [arrival, departure]
      puts "-> #{title}@#{arrival} ->"
      bus_stop = get_stop(title)
      stops << bus_stop.key
      unless prev.nil?
        prev.links << encoded_riak_link(bus_stop.bucket.name, bus_stop.key, "next_" + @exid)
        prev.store
      end
      bus_stop.store
      prev = bus_stop
      # Indices run from 0 to max - 1, so the final row is max - 1 (comparing
      # with max never matched). The bucket is passed by name, as for the
      # "next_" link above.
      if n == 0
        @line.links << encoded_riak_link(bus_stop.bucket.name, bus_stop.key, "first")
      end
      if n == max - 1
        @line.links << encoded_riak_link(bus_stop.bucket.name, bus_stop.key, "last")
      end
    end
    @line.data[:timetable] = timetable
    @line.data[:stops] = stops
    @line.store
  end

  # Fetches the station stored under "bus_<title>", or prepares a new one.
  #
  # A station with no data yet is filled in as a 'secondary' stop; those have no
  # id of their own, so one is made up from the line id plus five random letters.
  # The object is returned unsaved -- parse_page stores it.
  #
  # @param title [String] station name as shown on the line page
  # @return [Riak::RObject]
  def get_stop(title)
    stop = STATIONS_BUCKET.get_or_new("bus_#{title}")
    if stop.data.nil? || stop.data.empty?
      stop.content_type = "application/json"
      exid = "#{@exid}_#{random_string(5)}"
      stop.data = {
        # 'secondary' bus stops don't have their own IDs
        :exid => exid,
        :country => "HR",
        :secondary => true,
        :source => "AK"
      }
      puts " # Created station #{title} | #{exid}"
    end
    stop
  end
end
# Seeds the stations bucket from the locally saved page "rh_stanice.html".
#
# Every link under ".placeleft3 .placecontent" becomes a station object stored
# under "bus_<title>" (written with a fresh object each time, so an existing
# entry with that key is replaced). @stations is set to one station-page URL
# per link, in page order.
#
# @return [nil]
def get_stations
  # Block form so the file handle is closed once the document has been parsed.
  page = File.open("rh_stanice.html") { |file| Nokogiri::HTML(file) }
  puts "Stations.html loaded"
  @stations = page.css(".placeleft3 .placecontent a").map do |a|
    # href carries "?s=<station id>&d=<second value>"; both are reused verbatim.
    station_id, d_value = a[:href].split("?s=")[1].split("&d=")
    title = fix_encoding(a.content.strip)
    station = Riak::RObject.new(STATIONS_BUCKET, "bus_#{title}")
    station.content_type = "application/json"
    station.data = {
      :exid => station_id,
      :secondary => false,
      :country => "HR",
      :source => "AKZ"
    }
    station.store
    "#{BASE_URL}/stanica.aspx?s=#{station_id}&d=#{d_value}"
  end
  puts "Stored details about #{@stations.count} station into /riak/stations"
end
# Store the stations and fill @stations before any line is crawled.
get_stations
@lines = []
# Imports every bus line linked from a station page, once per line id.
#
# The id is whatever follows "lin=" in each ".cell8 a" link; ids already present
# in @parsed_lines are skipped, new ones are recorded and handed to BusLine.new.
#
# @param page parsed station page (anything answering css(selector))
def get_lines(page)
  page.css(".cell8 a").each do |anchor|
    line_id = anchor[:href].split("lin=")[1]
    unless @parsed_lines.include?(line_id)
      @parsed_lines << line_id
      BusLine.new(line_id)
    end
  end
end
# Crawl every station page and import the lines it links to.
#
# The iterator runs inside the reactor and is given a completion callback that
# stops it, so the script exits once the last station has been handled (also
# when @stations is empty) instead of idling in EM.run forever.
#
# NOTE(review): URI#open blocks the reactor thread, so the concurrency of 5
# presumably does not overlap the downloads -- confirm if throughput matters.
EM.run do
  crawl_station = proc do |url, iter|
    # URI#open instead of Kernel#open: the latter no longer fetches URLs on Ruby 3.
    data = URI.parse(url).open
    begin
      iter.next
      get_lines(Nokogiri::HTML(data))
    ensure
      data.close
    end
  end
  EM::Iterator.new(@stations, 5).each(crawl_station, proc { EM.stop })
end
#!/usr/bin/env ruby
# $KCODE only has an effect on Ruby 1.8; Ruby 1.9 and later ignore the
# assignment and print a warning.
$KCODE='UTF8'
require 'rubygems'
require 'open-uri'
require 'eventmachine'
require 'nokogiri'
require 'uri'
require 'ripple'
# Riak client and buckets are constants rather than instance variables of the
# top-level object: TrainLine's methods cannot see the latter, and the rest of
# this script already refers to STATIONS_BUCKET by constant name.
DB = Riak::Client.new
LINES_BUCKET = DB.bucket('lines')
STATIONS_BUCKET = DB.bucket('stations')
@TRAINS = []
# URLs of the train pages that have already been handed to TrainLine.
@PARSED_TRAINS = []
# Encodes a station name for the HZ timetable query string.
#
# The name is percent-encoded first (RFC 2396 reserved and unreserved
# characters are left alone, as URI.escape did by default; URI.escape itself is
# gone in Ruby 3.0). The UTF-8 escapes of the Croatian letters are then swapped
# for their single-byte Windows-1250 escapes, and spaces become "+".
#
# NOTE(review): "Š" used to map to %A9, which is its ISO-8859-2 code; every
# other entry is Windows-1250, where "Š" is 0x8A, so it is %8A now. Confirm
# against the live site.
#
# @param str [String] UTF-8 station name
# @return [String] value ready to be placed after "KO="
def escape_uri(str)
  escaped = str.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]/) do |unsafe|
    unsafe.unpack('C*').map { |byte| format('%%%02X', byte) }.join
  end
  escaped.
    gsub("%C5%A0", "%8A"). # Š
    gsub("%C5%A1", "%9A"). # š
    gsub("%C4%90", "%D0"). # Đ
    gsub("%C4%91", "%F0"). # đ
    gsub("%C4%8C", "%C8"). # Č
    gsub("%C4%8D", "%E8"). # č
    gsub("%C4%86", "%C6"). # Ć
    gsub("%C4%87", "%E6"). # ć
    gsub("%C5%BD", "%8E"). # Ž
    gsub("%C5%BE", "%9E"). # ž
    gsub("%20", "+") # <space>
end
# Returns +length+ lowercase ASCII letters, each chosen with one call to rand.
#
# @param length [Integer] how many letters to produce
# @return [String]
def random_string(length)
  alphabet = 'abcdefghijklmnopqrstuvwxyz'
  picked = ''
  (0...length).each { picked << alphabet[rand(26), 1] }
  picked
end
# Matches every character outside the RFC 2396 "unreserved" set
# (letters, digits and - _ . ! ~ * ' ( )). Spelled out literally -- it is the
# value of URI::PATTERN::UNRESERVED -- so it does not depend on the legacy
# URI parser constants.
URI_REGEXP = Regexp.new("[^\\-_.!~*'()a-zA-Z\\d]")

# Percent-encodes every character of +uri+ that is not RFC 2396 "unreserved".
#
# Each byte of a matched character becomes an upper-case %XX triplet, which is
# what URI.escape(uri, URI_REGEXP) produced; URI.escape itself was removed in
# Ruby 3.0.
#
# @param uri [String] text to encode (here: a Riak key)
# @return [String] the encoded copy
def encode_uri(uri)
  uri.gsub(URI_REGEXP) do |unsafe|
    unsafe.unpack('C*').map { |byte| format('%%%02X', byte) }.join
  end
end
# Wraps Riak::Link.new, running the key through encode_uri beforehand.
#
# @param bucket bucket argument passed to Riak::Link.new as given
# @param key [String] raw object key
# @param tag [String] link tag
# @return [Riak::Link]
def encoded_riak_link(bucket, key, tag)
  safe_key = encode_uri(key)
  Riak::Link.new(bucket, safe_key, tag)
end
# Imports a single train into Riak.
#
# Constructing an instance downloads the train's page, stores one station
# object per timetable row and stores the line itself under "train_<exid>".
# Remembering which URLs were already imported is left to the caller: an
# instance variable set at the top level of the script is not visible here.
class TrainLine
  # @param url [String] address of the train's page, as found in the listing
  # @param notes [String] free-text remark saved with the line
  def initialize(url, notes)
    # The id is the value of the VL query parameter. Taking the text *before*
    # "?VL=" (as split(...)[0] did) yields the same string for every train; the
    # old expression is kept only as a fallback when no VL parameter exists.
    @exid = url[/[?&]VL=([^&]*)/, 1] || url.split("?VL=")[0].split("&")[0]
    @line = Riak::RObject.new(LINES_BUCKET, "train_#{@exid}")
    @line.content_type = "application/json"
    @line.data = {
      :exid => @exid,
      :notes => notes,
      :source => "HZ"
    }
    # URI#open rather than Kernel#open: the url is scraped text, and Kernel#open
    # treats a leading "|" as a command to run. The block form also closes the
    # handle after parsing.
    # NOTE(review): a relative href would have to be resolved against the
    # listing page first -- confirm what the site emits.
    @page = URI.parse(url).open { |io| Nokogiri::HTML(io, nil, "windows-1250") }
    get_stops()
    @line.store
  end

  # Turns the rows of the page's timetable table into stored stops and links.
  #
  # * every stop gets a "next_<exid>" link to the stop that follows it;
  # * the line gets a "first" link to the first stop and a "last" link to the
  #   final one (a one-stop line gets both);
  # * the line's data receives :timetable and :stops in row order. The line
  #   object is saved by initialize once this method returns.
  def get_stops
    rows = @page.css('table:eq(2) tr')
    timetable = []
    stops = []
    prev = nil
    max = rows.count
    puts " O Train no.#{@exid} | #{max} stops"
    rows.each_with_index do |row, n|
      title, arrival, departure = row.css("td").map { |cell| cell.content.strip }[0, 3]
      timetable << [arrival, departure]
      puts "-> #{title}@#{arrival} ->"
      train_stop = get_stop(title)
      stops << train_stop.key
      unless prev.nil?
        prev.links << encoded_riak_link(train_stop.bucket.name, train_stop.key, "next_" + @exid)
        prev.store
      end
      train_stop.store
      prev = train_stop
      # Indices run from 0 to max - 1, so the final row is max - 1.
      if n == 0
        @line.links << encoded_riak_link(train_stop.bucket.name, train_stop.key, "first")
      end
      if n == max - 1
        @line.links << encoded_riak_link(train_stop.bucket.name, train_stop.key, "last")
      end
    end
    @line.data[:timetable] = timetable
    @line.data[:stops] = stops
  end

  # Fetches the station stored under "train_<title>", or prepares a new one.
  #
  # The "train_" prefix matches the keys get_terminals writes, so a stop that is
  # also a terminal resolves to the terminal's object. A station with no data
  # yet is filled in as a secondary stop with a made-up id. The object is
  # returned unsaved -- get_stops stores it.
  #
  # @param title [String] station name as shown in the timetable
  # @return [Riak::RObject]
  def get_stop(title)
    stop = STATIONS_BUCKET.get_or_new("train_#{title}")
    if stop.data.nil? || stop.data.empty?
      stop.content_type = "application/json"
      exid = "#{@exid}_#{random_string(5)}"
      stop.data = {
        :exid => exid,
        :country => "HR",
        :source => "HZ",
        :secondary => true
      }
      puts " # Created station #{title} | #{exid}"
    end
    stop
  end
end
# Imports every train listed on a terminal's departures page, once per URL.
#
# Nothing happens unless the page contains a font element coloured #FF0000.
# Each row of the third table is then inspected: rows without a link are
# skipped, and a linked URL not yet in @PARSED_TRAINS is recorded there and
# handed to TrainLine.new together with the text of the row's last cell.
#
# @param page parsed listing page (anything answering css(selector))
# @return [nil]
def get_trains(page)
  return if page.css("font[color='#FF0000']").empty?

  page.css("table:eq(2) tr").each do |row|
    link = row.css("a").first
    next if link.nil? # e.g. a header row: nothing to import

    url = link[:href]
    next if @PARSED_TRAINS.include?(url)

    @PARSED_TRAINS << url
    TrainLine.new(url, row.css("td").last.content.strip)
  end
  nil
end
# Stores every terminal offered by the HZ search form and returns their
# departure-listing URLs.
#
# Each option of the "KO" select box becomes a station object stored under
# "train_<name>", with the query-encoded name as its :exid.
#
# @return [Array<String>] one listing URL per terminal, in form order
def get_terminals()
  form_url = 'http://vred.hznet.hr/hzinfo/?category=hzinfo&service=izvr3'
  # URI#open in block form: closes the handle after parsing and keeps working
  # on Ruby 3, where Kernel#open no longer fetches URLs.
  form_page = URI.parse(form_url).open { |io| Nokogiri::HTML(io, nil, "windows-1250") }
  form_page.css("select[name=KO] option").map do |option|
    name = option.content.strip
    term = escape_uri(name)
    terminal = Riak::RObject.new(STATIONS_BUCKET, "train_" + name)
    terminal.content_type = "application/json"
    terminal.data = {
      :exid => term,
      :secondary => false,
      :country => "HR",
      :source => "HZ"
    }
    terminal.store
    "http://vred.hznet.hr/hzinfo/Default.asp?KO=#{term}&Category=hzinfo&Service=izvr3&LANG=HR&DL=1&OL=2&SCREEN=2"
  end
end
# Crawl every terminal's departure listing and import the trains on it.
#
# The iterator runs inside the reactor and is given a completion callback that
# stops it, so the script exits once the last terminal has been handled instead
# of idling in EM.run forever.
#
# NOTE(review): URI#open blocks the reactor thread, so the concurrency of 5
# presumably does not overlap the downloads -- confirm if throughput matters.
EM.run do
  crawl_terminal = proc do |url, iter|
    puts url
    # URI#open instead of Kernel#open: the latter no longer fetches URLs on Ruby 3.
    data = URI.parse(url).open
    begin
      iter.next
      get_trains(Nokogiri::HTML(data))
    ensure
      data.close
    end
  end
  EM::Iterator.new(get_terminals(), 5).each(crawl_terminal, proc { EM.stop })
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment