Created
July 27, 2009 20:14
-
-
Save hsribei/156713 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
require 'rubygems'
require 'mechanize'
require 'ostruct'
require 'open-uri'
require 'hpricot'
require 'shellwords'
require 'yaml'
# fix for screwed up net/http
# found at: http://pw.tech-arts.co.jp/technical/cat57/
module Net #:nodoc:
  class HTTPResponse
    class << HTTPResponse
      # Reads raw header lines from +sock+ up to the first blank line and
      # yields each header to the caller's block as a (name, value) pair.
      #
      # A line without a "Name:" prefix is treated as a continuation and is
      # appended verbatim to the value of the header currently being read.
      # A continuation that appears before any header line is ignored.
      #
      # sock - object responding to readuntil(terminator, ignore_eof) and
      #        returning one line per call.
      #
      # Yields the header name and the header value, both Strings.
      def each_response_header(sock)
        name = nil
        value = nil
        while true
          line = sock.readuntil("\n", true).sub(/\s+\z/, '')
          break if line.empty?

          header = /\A([^:]+):\s*/.match(line)
          if header.nil?
            value += line if name
            next
          end

          # A new header starts on this line, so the previous one is complete.
          yield name, value if name
          name = header[1]
          value = header.post_match
        end
        # Flush the last header: the blank line ends the loop before another
        # header line could trigger the yield above.
        yield name, value if name
      end
    end
  end # HTTPResponse
end # module Net
# from http://railsruby.blogspot.com/2006/07/url-escape-and-url-unescape.html
# Percent-encodes +string+ for use as a URL query value.
#
# ASCII letters, digits, "_", "." and "-" are kept as-is, spaces become "+",
# and every other character is replaced by "%XX" (uppercase hex) for each of
# its bytes.
#
# string - the String to escape.
#
# Returns the escaped String.
def url_escape(string)
  string.gsub(/[^ a-zA-Z0-9_.-]+/) do |unsafe|
    # Encode byte by byte so multibyte characters keep all of their bytes.
    unsafe.bytes.map { |byte| '%%%02X' % byte }.join
  end.tr(' ', '+')
end
# Keep the original WWW::Mechanize class when the installed gem still provides
# it; otherwise fall back to the top-level Mechanize constant.
agent = (defined?(WWW::Mechanize) ? WWW::Mechanize : Mechanize).new
legislators = []
legislators_cache = "legislators.yml"

# we're keeping a file just with the list of legislators so we don't need
# to read it again every time we're testing the extraction of their
# voting history (which is still buggy)
if File.exist?(legislators_cache)
  # The cache is written by this script itself (see the dump below).
  # NOTE(review): newer Psych versions make YAML.load refuse arbitrary classes
  # such as OpenStruct by default -- confirm whether this needs to become
  # YAML.unsafe_load on the Ruby version in use.
  legislators = File.open(legislators_cache) { |f| YAML::load(f) }
else
  page = agent.get('http://www2.camara.gov.br/deputados')
  form = page.form('form1')
  select = form.fields[5]

  # the first option is just the label
  select.options[1..-1].each do |option|
    # Per the example URL below, the option value looks like
    # "NAME|<DepID>%<DepMat>!<DepUF>=<DepPart>".
    select_value = option.value.to_s

    legislator = OpenStruct.new
    legislator.name = option.text
    legislator.leg_id = select_value[/\|(\d+)%/, 1]
    legislator.mat = select_value[/%(\d+)!/, 1]
    legislator.uf = select_value[/\!(.*)=/, 1]
    legislator.part = select_value[/=(.*)/, 1]

    # we're trying to build this url:
    # http://www.camara.gov.br/internet/deputado/deputado_atual_resp.asp?fMode=1&deputado=ABELARDO%20CAMARINHA%7C528580%2523329%21SP%3DPSB&Pesquisa=Pesquisar&rbDeputado=VP&DepID=528580&DepUF=SP&DepMat=23329&DepPart=PSB
    legislator.site_metadata = OpenStruct.new
    legislator.site_metadata.select_value = select_value
    legislator.site_metadata.votings_url = "http://www.camara.gov.br/internet/deputado/deputado_atual_resp.asp?fMode=1&deputado=#{url_escape(select_value)}&Pesquisa=Pesquisar&rbDeputado=VP&DepID=#{legislator.leg_id}&DepUF=#{legislator.uf}&DepMat=#{legislator.mat}&DepPart=#{legislator.part}"

    puts legislator.name
    legislators << legislator
  end

  # Block form so the cache file is flushed and closed right away.
  File.open(legislators_cache, "w") { |f| YAML::dump(legislators, f) }
end
field_names = ["legislator_id", "plenary_session", "date", "proposition_title", "legislator_present", "vote"]
votings_csv = "votings.csv"

# Wraps a value in double quotes, doubling any embedded quote so the CSV row
# stays well-formed.
csv_quote = lambda { |text| '"' + text.gsub('"', '""') + '"' }

# Start a fresh CSV file containing only the header row.
File.open(votings_csv, "w") { |f| f << field_names.map { |field| csv_quote.call(field) }.join(",") + "\n" }

# legislators_to_rip = legislators.select { |d| d.votings.nil? }
legislators_to_rip = legislators
legislators_to_rip.each_with_index do |legislator, i|
  # The URL is built from scraped data, so shell-escape it before handing it
  # to the links | iconv pipeline.
  votings_url = Shellwords.escape(legislator.site_metadata.votings_url)
  doc = Hpricot(`links -source #{votings_url} | iconv -f ISO_8859-1 -t UTF-8`)

  # Skip the first row; slicing yields nil when the page has no rows at all.
  rows = (doc / "tr")[1..-1] || []
  legislator.votings = []
  puts "(%d/%d) Scraping votings for %s" % [i + 1, legislators_to_rip.size, legislator.name]

  # Open the output once per legislator instead of once per row.
  File.open(votings_csv, "a") do |csv|
    rows.each do |row|
      begin
        next if row.nil?

        fields = (row / "td").map { |cell| cell.innerText.strip }
        # Only rows with exactly five cells are written out.
        next unless fields.size == 5

        # Drop the last character of the fourth cell.
        fields[3] = fields[3][0...-1]
        voting = [legislator.leg_id.to_s] + fields.map { |field| csv_quote.call(field) }
        csv << voting.join(",") + "\n"
        print '.' # progress indicator
      rescue StandardError => e
        puts "EXCEPTION while scraping votins for #{legislator.name}: "
        puts e.inspect
        # Stop with a failing status so callers can tell the run was aborted.
        exit 1
      end
    end
  end
  puts "\n"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment