# ICIS scraping template from StartupWeekend
# Source: GitHub gist chadbrewbaker/4953455, created February 14, 2013 15:14.
# Note carried over from the gist page: this file contains bidirectional
# Unicode text that may be interpreted or compiled differently than it appears;
# open it in an editor that reveals hidden Unicode characters to review it.
require 'rubygems'

require 'csv'
require 'date'
require 'json'
require 'net/http'
require 'net/https'
require 'pp'

require 'mechanize'
# Scraper for Iowa Courts Online: looks people up by name in the statewide
# trial-court search, collects the filings of every case found, and stores
# recent filings through ScraperWiki.
module IowaCourts
  # Alternate spreadsheet that was used at some point:
  # '/spreadsheet/pub?key=0Ao6S2MATTJSMdE1iTE1lc1hDNk9mdndHWWVOczl0b3c&single=true&gid=0&output=csv'
  #
  # Path (on docs.google.com) of the published CSV listing the names to search.
  # The previous value contained "key=3D...", a quoted-printable artifact for
  # "key=...", which does not identify the spreadsheet.
  IOWA_COURTS_SPREADSHEET = '/spreadsheet/pub?key=0AjE_z4zpAEzVdE51ZDZMQTB1N3c1VU96SDVkcXhIbXc&single=true&gid=0&output=csv'

  LOGIN_URL = 'https://www.iowacourts.state.ia.us/ESAWebApp/ESALogin.jsp'
  SEARCH_URL = 'https://www.iowacourts.state.ia.us/ESAWebApp/TrialCourtStateWide'

  # Lists the people to look up.
  #
  # Currently returns a single hard-coded name. The spreadsheet-backed
  # version is kept below for reference but is not enabled.
  #
  # @return [Array<Array<String>>] one [first, middle, last] triple per person
  def self.get_cases
    pp 'howdy'
    [['Phil', 'F', 'Elgin']]
    # https = Net::HTTP.new('docs.google.com', 443)
    # req = Net::HTTP::Get.new(IOWA_COURTS_SPREADSHEET)
    # https.use_ssl = true
    # https.verify_mode = OpenSSL::SSL::VERIFY_NONE
    # response = https.request(req)
    # CSV.parse(response.body)[1..-1].collect do |row|
    #   [row[1], row[2], row[3]]
    # end
  end

  # Searches the statewide trial-court index for one person and collects the
  # filings of every case listed in the result table.
  #
  # Failures while opening an individual case are logged and that case is
  # left with an empty filings list. Any other failure is logged with its
  # backtrace and re-raised so the caller can record it.
  #
  # @param first [String, nil] first name
  # @param middle [String, nil] middle name or initial
  # @param last [String, nil] last name
  # @return [Hash] { person_id => { case_id => case record } }
  # @raise [StandardError] whatever went wrong outside the per-case steps
  def self.scrape(first, middle, last)
    agent = Mechanize.new
    # NOTE(review): certificate verification is deliberately switched off here;
    # confirm this is still required for the court site.
    agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
    # Pause briefly after every page load.
    agent.history_added = Proc.new { sleep 0.1 }
    pp "first is #{first}"
    pp "middle is #{middle}"

    # Visit the login page first so the server treats us as a regular
    # browser session and serves HTML.
    agent.get(LOGIN_URL)
    page = agent.get(SEARCH_URL)

    # NOTE(review): the form name is spelled 'Trail', unlike the URL's 'Trial';
    # presumably it matches the site's markup -- confirm before "fixing" it.
    namesearch = page.form('TrailCourtStateWide')
    namesearch.last = (last || '').downcase
    namesearch.first = (first || '').downcase
    namesearch.middle = (middle || '').downcase

    # If the submit fails we keep working with the page we already have.
    page = log_failure('e69') { agent.submit(namesearch, namesearch.buttons.first) } || page

    caselist = parse_case_list(page)
    trialform = page.form('TrialForm')

    caselist.each do |_person_id, cases|
      cases.each do |case_id, courtcase|
        filings_page = fetch_filings_page(agent, trialform, case_id)
        next if filings_page.nil?

        collect_filings(filings_page, courtcase)
        # TODO: charges and dispositions are not scraped yet, so :charges,
        # :initiated and :disposition keep their initial values.
      end
    end

    caselist
  rescue StandardError => e
    pp "e207 #{e}"
    puts e.backtrace
    raise
  end

  # Runs {scrape} for every name from {get_cases}.
  #
  # @return [Array] a pair [case lists of the successful scrapes,
  #   error hashes ({:description => String}) of the failed ones]
  def self.scrape_all
    errors = []
    cases = get_cases.map do |first, middle, last|
      pp "#{first}#{middle}#{last}"
      begin
        scrape(first, middle, last)
      rescue StandardError
        errors.push({:description => "Scraping error for #{first}, #{middle}, #{last}"})
        nil
      end
    end
    [cases.compact, errors]
  end

  # Drops the ScraperWiki tables written by {save} and {log}.
  def self.clear
    %w[filings errors].each do |table|
      ScraperWiki.sqliteexecute("DROP TABLE IF EXISTS #{table}")
    end
  end

  # Stores every filing dated within the last week in the "filings" table.
  #
  # Filings whose date cannot be parsed as mm/dd/yyyy are logged and skipped.
  # The input is not modified.
  #
  # @param data [Array<Hash>] case lists as returned by {scrape}
  # @return [Array<Hash>] data
  def self.save(data)
    data.each do |caselist|
      caselist.each do |person_id, cases|
        cases.each do |case_id, courtcase|
          pp case_id
          courtcase[:filings].each do |filing|
            filed_on = parse_filing_date(filing[:filed])
            next if filed_on.nil?
            next unless (Date.today - filed_on).to_i < 8

            record = {
              :id => rand(36**8),
              :case_id => case_id,
              # scrape does not store the person id on the case record, so
              # fall back to the key the case is filed under.
              :person_id => courtcase[:person_id] || person_id,
              :case => courtcase[:title],
              :person => courtcase[:person],
              :filing => filing[:event],
              :filed_by => filing[:filed_by],
              :date => filed_on,
              :comments => filing[:comments].join
            }
            ScraperWiki.save_sqlite(['id'], record, 'filings')
          end
        end
      end
    end
  end

  # Stores scrape errors in the "errors" table, keyed by description.
  #
  # @param errors [Array<Hash>] error hashes as returned by {scrape_all}
  def self.log(errors)
    ScraperWiki.save_sqlite(['description'], errors, 'errors')
  end

  # Runs the block; on failure prints "<tag> <message>" and returns nil.
  def self.log_failure(tag)
    yield
  rescue StandardError => e
    pp "#{tag} #{e}"
    nil
  end

  # Builds an empty record for a newly discovered case.
  def self.new_case_record
    {
      :initiated => nil,
      :disposition => nil,
      :charges => [],
      :checked => Date.today.to_s,
      :filings => []
    }
  end

  # Reads the first table of the search result page into
  # { person_id => { case_id => empty case record } }.
  def self.parse_case_list(page)
    caselist = {}
    table = page.search('table')[0]
    return caselist if table.nil?

    table.search('tr').each do |row|
      cells = row.search('td')
      # Only rows with at least two cells and a link in the first one are cases.
      next if cells.length < 2

      link = cells[0].search('a')
      next unless link.length > 0

      # Matches hrefs like javascript:mySubmit('02121++EQCV020477','BT1117135')
      case_id, person_id = link[0].attribute('href').to_s.scan(/.+\('(.+)','(.+)'/)[0]
      # A link that does not carry both ids is not a case link.
      next if case_id.nil? || person_id.nil?

      caselist[person_id] ||= {}
      caselist[person_id][case_id] = new_case_record
    end
    caselist
  end

  # Opens one case and navigates to its "Filings" page.
  # Returns that page, or nil when any step fails (each failure is logged).
  def self.fetch_filings_page(agent, trialform, case_id)
    trialform.field_with(:name => 'caseid').value = case_id

    trial = log_failure('e225') { agent.submit(trialform) }
    return nil if trial.nil?

    # The navigation bar with the "Filings" link lives in the "banner" frame;
    # if the frame cannot be opened, look for the link on the case page itself.
    trial = log_failure('130') { trial.frame_with(:name => 'banner').click } || trial
    log_failure('e137') { trial.link_with(:text => 'Filings').click }
  end

  # Stripped text of cells[index], or nil when the row has no such cell.
  def self.cell_text(cells, index)
    cell = cells[index]
    cell ? cell.text.strip : nil
  end

  # Appends the filings found in the first table of filings_page to
  # courtcase[:filings]. Rows with more than two cells are filings; shorter
  # rows are comments attached to the preceding filing.
  def self.collect_filings(filings_page, courtcase)
    table = filings_page.search('table')[0]
    return if table.nil?

    # The first two rows are labels.
    table.search('tr').to_a.drop(2).each do |row|
      cells = row.search('td')
      next if cells.empty?

      if cells.length > 2
        print cells[0].text + "\n"
        print cells[1].text + "\n"
        print cells[2].text + "\n"
        courtcase[:filings].push({
          :event => cell_text(cells, 0),
          :filed_by => cell_text(cells, 1),
          :filed => cell_text(cells, 2),
          :created => cell_text(cells, 3),
          :updated => cell_text(cells, 4),
          :comments => []
        })
      else
        last_filing = courtcase[:filings][-1]
        last_filing[:comments].push(cell_text(cells, 0)) if last_filing
      end
    end
  end

  # Parses an mm/dd/yyyy date; logs and returns nil when it is malformed.
  def self.parse_filing_date(text)
    Date.strptime(text.to_s, '%m/%d/%Y')
  rescue ArgumentError
    pp "skipping filing with unparseable date #{text.inspect}"
    nil
  end

  private_class_method :log_failure, :new_case_record, :parse_case_list,
                       :fetch_filings_page, :cell_text, :collect_filings,
                       :parse_filing_date
end
# Scrape everything, then either persist through ScraperWiki (when that
# library is loaded) or dump the cases as JSON on stdout.
cases, errors = IowaCourts.scrape_all

if defined?(ScraperWiki)
  IowaCourts.clear
  IowaCourts.save(cases)
  IowaCourts.log(errors)
else
  puts JSON.pretty_generate(cases)
  # Without ScraperWiki there is no errors table, so report failures on
  # stderr instead of dropping them; stdout stays valid JSON output.
  errors.each { |error| warn error[:description] }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment