public
Created

Web Scraping NICAR Python class

  • Download Gist
part1.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
import requests
import csv
from pyquery import PyQuery as pq
 
# Scrape the bill number and document link of every row listed on
# http://www.azleg.gov/StrikeEverything.asp into strike_all_bills.csv
# (one CSV row per table row, no header row).
#
# NOTE(review): on Python 3 the csv docs recommend opening the file with
# newline='' to avoid blank lines on Windows; left as plain 'w' here so the
# script keeps working on whichever interpreter the class uses.
with open('strike_all_bills.csv', 'w') as f:
    # Every key stored in datarow must be listed here: DictWriter raises
    # ValueError on keys that are missing from its fieldnames.
    wr = csv.DictWriter(f, ['billno', 'docurl'])

    req = requests.get('http://www.azleg.gov/StrikeEverything.asp')
    html = pq(req.text)
    # The data rows are the siblings of the table's header row.
    rows = html('tr.TableHeaderBackground').siblings()
    for row in rows:
        cells = pq(row).children()
        if len(cells) == 0:
            # Nothing to read from a row without cells.
            continue
        datarow = {}
        # lxml's .text is None when the cell has no leading text.
        datarow['billno'] = (cells[0].text or '').strip()
        datarow['docurl'] = cells.children('a').attr('href')
        if datarow['docurl']:
            # Fetch and parse the linked bill document; the parsed page is
            # not used yet and is kept as the hook for further extraction.
            billreq = requests.get(datarow['docurl'])
            billhtml = pq(billreq.text)
        wr.writerow(datarow)
part2.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
import requests
import csv
from pyquery import PyQuery as pq
 
# POST a search form to query.nictusa.com and save the rows of the second
# table in the response to obamafec.csv (no header row).

# Form fields sent with the POST request.
postvars = {
    'dbyear': 12,
    'cancom': 2,
    'name': "OBAMA",
}

# CSV columns, in the same order as the cells of each data row.
fieldnames = ['committee_name', 'receipts', 'disb', 'cash', 'debt', 'through']

# The with-block guarantees the file is flushed and closed even if the
# request or the parsing below raises.
with open('obamafec.csv', 'w') as f:
    wr = csv.DictWriter(f, fieldnames)

    req = requests.post('http://query.nictusa.com/cgi-bin/cancomsrs/', postvars)
    html = pq(req.text)
    # The second <table> on the page is the one that is scraped.
    committee_table = pq(html('table')[1])
    rows = committee_table('tr')
    for row in rows:
        cells = pq(row).children()
        if len(cells) != len(fieldnames):
            # Only rows with exactly six cells carry values; others are titles.
            continue
        # Pair each column name with the text of the cell in that position.
        datarow = dict(
            (name, pq(cell).text()) for name, cell in zip(fieldnames, cells)
        )
        wr.writerow(datarow)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.