Skip to content

Instantly share code, notes, and snippets.

@hsribei
Created July 27, 2009 20:14
Show Gist options
  • Save hsribei/156713 to your computer and use it in GitHub Desktop.
Save hsribei/156713 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
require 'rubygems'
require 'mechanize'
require 'ostruct'
require 'open-uri'
require 'hpricot'
require 'yaml'
# fix for screwed up net/http
# found at: http://pw.tech-arts.co.jp/technical/cat57/
# Monkey patch: replaces Net::HTTPResponse.each_response_header with a more
# tolerant parser. A line that does not look like "Name: value" is appended
# to the value of the header before it rather than being treated as an error.
module Net #:nodoc:
  class HTTPResponse
    class << HTTPResponse
      # Reads response header lines from +sock+ until the first blank line and
      # yields each header to the caller's block as (name, value).
      #
      # sock - object responding to readuntil(terminator, ignore_eof), which
      #        returns the next line of the response.
      #
      # A line without a "Name:" prefix is concatenated (no separator) onto
      # the value of the most recent header. Such a line is ignored when no
      # header has been seen yet, because there is no name to attach it to.
      def each_response_header(sock)
        name = nil
        value = nil
        loop do
          line = sock.readuntil("\n", true).sub(/\s+\z/, '')
          break if line.empty?

          m = /\A([^:]+):\s*/.match(line)
          if m.nil?
            value += line if name
            next
          end

          # A new header starts here, so the previous one is now complete.
          yield name, value if name
          name = m[1]
          value = m.post_match
        end
        # The blank line terminates the last header as well; without this
        # final yield the last header of every response would be lost.
        yield name, value if name
      end
    end
  end # HTTPResponse
end # module Net
# from http://railsruby.blogspot.com/2006/07/url-escape-and-url-unescape.html
# Percent-encodes +string+ for use in a URL query component.
#
# ASCII letters, digits, "_", "." and "-" are kept as they are, spaces become
# "+", and every other byte is written as "%XX" (upper-case hex).
#
# Escaping is done per byte, not per character, so multi-byte characters are
# encoded completely (e.g. UTF-8 "é" becomes "%C3%A9").
#
# string - the String to escape.
#
# Returns a new escaped String; the argument is not modified.
def url_escape(string)
  escaped = string.gsub(/[^ a-zA-Z0-9_.-]+/) do |run|
    run.bytes.map { |byte| '%%%02X' % byte }.join
  end
  escaped.tr(' ', '+')
end
# Newer releases of the mechanize gem expose the class as top-level Mechanize,
# older ones as WWW::Mechanize; use whichever the installed gem provides.
agent = (defined?(::Mechanize) ? ::Mechanize : WWW::Mechanize).new
legislators = []
legislators_cache = "legislators.yml"
# we're keeping a file just with the list of legislators so we don't need
# to read it again every time we're testing the extraction of their
# voting history (which is still buggy)
if File.exist?(legislators_cache)
  legislators = File.open(legislators_cache) do |f|
    # The cache holds OpenStruct objects, which newer Psych versions refuse to
    # build from a plain YAML.load. The file is written by this script itself;
    # do not point this at a file from an untrusted source.
    YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(f) : YAML.load(f)
  end
else
  page = agent.get('http://www2.camara.gov.br/deputados')
  form = page.form('form1')
  select = form.fields[5]
  # the first option is just the label
  legislators = (select.options[1..-1] || []).map do |option|
    # The option value packs several ids into one string; the regexes below
    # pick them apart using the "|", "%", "!" and "=" delimiters
    # (e.g. "ABELARDO CAMARINHA|528580%23329!SP=PSB").
    value = option.value
    legislator = OpenStruct.new
    legislator.name = option.text
    legislator.leg_id = value[/\|(\d+)%/, 1]
    legislator.mat = value[/%(\d+)!/, 1]
    legislator.uf = value[/\!(.*)=/, 1]
    legislator.part = value[/=(.*)/, 1]
    # we're trying to build this url:
    # http://www.camara.gov.br/internet/deputado/deputado_atual_resp.asp?fMode=1&deputado=ABELARDO%20CAMARINHA%7C528580%2523329%21SP%3DPSB&Pesquisa=Pesquisar&rbDeputado=VP&DepID=528580&DepUF=SP&DepMat=23329&DepPart=PSB
    legislator.site_metadata = OpenStruct.new
    legislator.site_metadata.select_value = value
    legislator.site_metadata.votings_url =
      "http://www.camara.gov.br/internet/deputado/deputado_atual_resp.asp" \
      "?fMode=1&deputado=#{url_escape(value)}" \
      "&Pesquisa=Pesquisar&rbDeputado=VP" \
      "&DepID=#{legislator.leg_id}&DepUF=#{legislator.uf}" \
      "&DepMat=#{legislator.mat}&DepPart=#{legislator.part}"
    puts legislator.name
    legislator
  end
  # Block form closes (and therefore flushes) the cache file right away.
  File.open(legislators_cache, "w") { |f| YAML.dump(legislators, f) }
end
# Scrapes the voting history page of every legislator and writes one CSV row
# per vote to votings.csv (the file is overwritten on each run).
require 'shellwords'

field_names = ["legislator_id", "plenary_session", "date", "proposition_title", "legislator_present", "vote"]
# Wraps a field in double quotes, doubling any embedded quote so the CSV
# stays well-formed.
quote = lambda { |field| "\"#{field.to_s.gsub('"', '""')}\"" }
# legislators_to_rip = legislators.select { |d| d.votings.nil? }
legislators_to_rip = legislators
File.open("votings.csv", "w") do |csv|
  csv << field_names.map { |field| quote.call(field) }.join(",") << "\n"
  legislators_to_rip.each_with_index do |legislator, i|
    # The URL is built from data scraped off the site, so it must be escaped
    # before being handed to the shell.
    url = Shellwords.escape(legislator.site_metadata.votings_url)
    doc = Hpricot(`links -source #{url} | iconv -f ISO_8859-1 -t UTF-8`)
    # Skip the first row; slicing an empty result gives nil, hence the fallback.
    rows = (doc / "tr")[1..-1] || []
    legislator.votings = []
    puts "(%d/%d) Scraping votings for %s" % [i + 1, legislators_to_rip.size, legislator.name]
    rows.each do |row|
      begin
        fields = (row / "td").map { |f| f.innerText.strip }
        # Only rows with exactly five cells are votes. Per the original
        # field mapping they are: Sessao Plenaria, Data, Votacao,
        # Frequencia na Sessao, Voto.
        next unless fields.size == 5
        fields[3] = fields[3][0...-1] # exclude &nbsp; at the end
        voting = [legislator.leg_id.to_s] + fields.map { |field| quote.call(field) }
        csv << voting.join(",") << "\n"
        print '.' # progress indicator
      rescue StandardError => e
        # StandardError only, so Ctrl-C and exit are not swallowed;
        # a non-zero status tells the caller the run failed.
        puts "EXCEPTION while scraping votins for #{legislator.name}: "
        puts e.inspect
        exit 1
      end
    end
    # Push this legislator's rows to disk before starting the next one.
    csv.flush
    puts "\n"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment