Skip to content

Instantly share code, notes, and snippets.

@markng
Created February 25, 2012 04:34
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markng/1906477 to your computer and use it in GitHub Desktop.
Save markng/1906477 to your computer and use it in GitHub Desktop.
Web Scraping NICAR Python class
import requests
import csv
from pyquery import PyQuery as pq
# Scrape the Arizona legislature "strike everything" listing and write one CSV
# row per bill: the bill number and the URL of the document it links to.
with open('strike_all_bills.csv', 'w') as f:
    # Every key stored in a datarow must be declared as a field name here;
    # DictWriter.writerow raises ValueError for keys it was not told about.
    wr = csv.DictWriter(f, ['billno', 'docurl'])
    req = requests.get('http://www.azleg.gov/StrikeEverything.asp')
    html = pq(req.text)
    # The data rows are selected as the siblings of the table's header row.
    rows = html('tr.TableHeaderBackground').siblings()
    for row in rows:
        cells = pq(row).children()
        if not cells:
            # A sibling without any child cells has no bill number to read.
            continue
        datarow = {}
        # lxml's .text is None when the cell has no leading text, so fall
        # back to an empty string before stripping.
        datarow['billno'] = (cells[0].text or '').strip()
        datarow['docurl'] = cells.children('a').attr('href')
        if datarow['docurl']:
            # NOTE(review): requests needs an absolute URL; confirm the page
            # does not emit relative hrefs.
            billreq = requests.get(datarow['docurl'])
            # Parsed bill document; nothing in this script reads it yet.
            billhtml = pq(billreq.text)
        wr.writerow(datarow)
import requests
import csv
from pyquery import PyQuery as pq
# Query the FEC candidate/committee search for "OBAMA" and dump the committee
# summary table to obamafec.csv, one CSV row per six-column table row.
postvars = {
    'dbyear': 12,
    'cancom': 2,
    'name': "OBAMA",
}
# CSV columns, in the same left-to-right order as the table cells they hold.
fieldnames = ['committee_name', 'receipts', 'disb', 'cash', 'debt', 'through']

# The with-block guarantees the file is flushed and closed even if a request
# or a parse step raises partway through.
with open('obamafec.csv', 'w') as f:
    wr = csv.DictWriter(f, fieldnames)
    req = requests.post('http://query.nictusa.com/cgi-bin/cancomsrs/', postvars)
    html = pq(req.text)
    # The script reads the second <table> on the results page.
    committee_table = pq(html('table')[1])
    rows = committee_table('tr')
    for row in rows:
        # Query the row's cells once instead of once per column.
        cells = pq(row).children()
        if len(cells) == 6:
            # we have a row with values, not a title
            datarow = dict(zip(fieldnames, (pq(cell).text() for cell in cells)))
            wr.writerow(datarow)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment