Created
March 25, 2009 05:07
-
-
Save hsribei/85300 to your computer and use it in GitHub Desktop.
Script para ripar nomes, partidos, estados e votações de deputados federais.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# to read it again every time we're testing the extraction of their | |
# voting history (which is still buggy) | |
# Build the `legislators` list: reuse the YAML cache when it exists, otherwise
# scrape the deputies <select> from the Chamber site and write the cache.
# NOTE(review): newer Psych versions restrict which classes YAML.load will
# deserialize; confirm the cached OpenStruct objects still load on the Ruby in use.
if File.exist?("legislators.yml")
  legislators = File.open("legislators.yml") { |f| YAML::load(f) }
else
  page = agent.get('http://www2.camara.gov.br/deputados')
  form = page.form('form1')
  select = form.fields[5]
  legislators = []
  # the first option is just the label
  select.options[1..-1].each do |option|
    # Read the option's raw instance variables once instead of re-evaluating
    # a string for every regex below.
    option_text = option.instance_variable_get(:@text)
    option_value = option.instance_variable_get(:@value)

    legislator = OpenStruct.new
    legislator.name = option_text
    # The select value looks like "NAME|528580%23329!SP=PSB" (see the example
    # URL below); each capture is nil when its pattern does not match.
    legislator.leg_id = option_value[/\|(\d+)%/, 1]
    legislator.mat = option_value[/%(\d+)!/, 1]
    legislator.uf = option_value[/\!(.*)=/, 1]
    legislator.part = option_value[/=(.*)/, 1]
    # we're trying to build this url:
    # http://www.camara.gov.br/internet/deputado/deputado_atual_resp.asp?fMode=1&deputado=ABELARDO%20CAMARINHA%7C528580%2523329%21SP%3DPSB&Pesquisa=Pesquisar&rbDeputado=VP&DepID=528580&DepUF=SP&DepMat=23329&DepPart=PSB
    legislator.site_metadata = OpenStruct.new
    legislator.site_metadata.select_value = option_value
    legislator.site_metadata.votings_url = "http://www.camara.gov.br/internet/deputado/deputado_atual_resp.asp?fMode=1&deputado=#{url_escape(legislator.site_metadata.select_value)}&Pesquisa=Pesquisar&rbDeputado=VP&DepID=#{legislator.leg_id}&DepUF=#{legislator.uf}&DepMat=#{legislator.mat}&DepPart=#{legislator.part}"
    puts legislator.name
    legislators << legislator
  end
  # Block form closes (and therefore flushes) the cache file deterministically.
  File.open("legislators.yml", "w") { |f| YAML::dump(legislators, f) }
end
# Scrape every legislator's voting table and append one CSV row per vote to
# votings.csv (the file is truncated and the header rewritten on each run).
field_names = ["legislator_id", "plenary_session", "date", "proposition_title", "legislator_present", "vote"]
File.open("votings.csv", "w") { |f| f << field_names.map { |field| "\"#{field}\"" }.join(",") + "\n" }

# To resume a partial run instead, restrict to legislators without votings:
# legislators_to_rip = legislators.select { |d| d.votings.nil? }
legislators_to_rip = legislators
legislators_to_rip.each_with_index do |legislator, i|
  # NOTE(review): the URL is interpolated into a shell command line; it is
  # built from scraped/cached values, so a hostile value could inject shell
  # syntax. Confirm the source is trusted or switch to an argv-style call.
  doc = Hpricot(`links -source "#{legislator.site_metadata.votings_url}" | iconv -f ISO_8859-1 -t UTF-8`)
  # Skip the first <tr>; slicing an empty result yields nil, so fall back to
  # an empty list instead of crashing on pages without any table rows.
  rows = (doc / "tr")[1..-1] || []
  legislator.votings = []
  puts "(%d/%d) Scraping votings for #{legislator.name}" % [i+1, legislators_to_rip.size]
  # Open the CSV once per legislator rather than once per row.
  File.open("votings.csv", "a") do |csv|
    rows.each do |row|
      begin
        next if row.nil?
        fields = (row / "td").map { |f| f.innerText.strip }
        # Only rows with exactly five cells are vote rows; the cells map, in
        # order, to the header columns after legislator_id.
        next unless fields.size == 5
        fields[3] = fields[3][0...-1] # drop the trailing character of the presence cell
        # Double embedded quotes so a field containing '"' cannot break the row.
        voting = [legislator.leg_id.to_s] + fields.map { |field| "\"#{field.gsub('"', '""')}\"" }
        csv << voting.join(",") + "\n"
        print '.' # progress indicator
      rescue StandardError => e
        # StandardError only: Interrupt/SystemExit must not be swallowed here.
        puts "EXCEPTION while scraping votins for #{legislator.name}: "
        puts e.inspect
        exit 1 # non-zero status so callers can detect the failed run
      end
    end
  end
  puts "\n"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment