Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Download lobbying disclosures from the US House of Representatives
#!/usr/bin/env python
# Copyright (c) 2009, Jeremy Carbaugh
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of Jeremy Carbaugh, Sunlight Foundation, Sunlight Labs
# nor the names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
# THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import httplib
import urllib, urllib2
import os
import re
# Host that serves the lobbying disclosure download form.
HTTP_HOST = 'disclosures.house.gov'
# Path of the download form on HTTP_HOST; the form page is fetched from it
# and the download requests are POSTed back to it.
HTTP_PATH = '/ld/LDDownload.aspx'
def get_context():
    """Fetch the download form and scrape the state needed to submit it.

    The page at http://HTTP_HOST + HTTP_PATH is read once and parsed with
    regular expressions for the hidden ``__VIEWSTATE`` and
    ``__EVENTVALIDATION`` inputs and for every ``<option>`` element.

    Returns:
        A dict with three keys:
          'viewstate'   -- value of the hidden __VIEWSTATE input
          'evalidation' -- value of the hidden __EVENTVALIDATION input
          'disclosures' -- list of (option_value, filename) tuples, where
                           filename is the option value up to its first
                           '(' with spaces removed, plus '.zip'

    Raises:
        IndexError: if either hidden input is not found in the page.
        Any error raised by urllib2.urlopen propagates unchanged.
    """
    response = urllib2.urlopen('http://%s%s' % (HTTP_HOST, HTTP_PATH))
    try:
        content = response.read()
    finally:
        # Release the underlying socket even if the read fails.
        response.close()

    disc_re = re.compile(
        r'<option value="(?P<value>.*?)">(?P<description>.*?)</option>')
    vs_re = re.compile(
        r'<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" '
        r'value="(?P<viewstate>.*?)" />')
    ev_re = re.compile(
        r'<input type="hidden" name="__EVENTVALIDATION" '
        r'id="__EVENTVALIDATION" value="(?P<evalidation>.*?)" />')

    viewstate = vs_re.findall(content)[0]
    evalidation = ev_re.findall(content)[0]

    disclosures = []
    # findall yields (value, description) tuples; only the value is used.
    for value, _description in disc_re.findall(content):
        filename = '%s.zip' % value.split('(')[0].replace(' ', '')
        disclosures.append((value, filename))

    return {
        'viewstate': viewstate,
        'evalidation': evalidation,
        'disclosures': disclosures,
    }
def download_disclosures(context):
    """Download every disclosure archive listed in *context*.

    For each (option_value, filename) pair in ``context['disclosures']`` the
    download form is POSTed to HTTP_PATH on HTTP_HOST with the scraped
    ``__VIEWSTATE`` / ``__EVENTVALIDATION`` values, and the response body is
    saved as ``ld_download/<filename>``. One progress line is printed per
    archive, followed by 'Done!' once all have been fetched.

    Args:
        context: dict as returned by get_context(), with the keys
            'viewstate', 'evalidation' and 'disclosures'.
    """
    if not os.path.exists('ld_download'):
        os.mkdir('ld_download')

    headers = {"Content-type": "application/x-www-form-urlencoded"}

    for value, filename in context['disclosures']:
        params = {
            'selFilesXML': value,
            '__VIEWSTATE': context['viewstate'],
            '__EVENTVALIDATION': context['evalidation'],
            'btnDownloadXML': 'Download',
        }

        conn = httplib.HTTPConnection(HTTP_HOST)
        try:
            conn.request('POST', HTTP_PATH, urllib.urlencode(params), headers)
            response = conn.getresponse()

            # Report the size in KB below 1 MB, otherwise in MB.
            content_length = float(response.getheader('Content-Length', 0))
            kb = content_length / 1024.0
            if kb < 1024.0:
                filesize = "%0.1fKB" % kb
            else:
                filesize = "%0.1fMB" % (kb / 1024.0,)

            # The text after the first '(' of the option value is echoed in
            # the progress line; tolerate values that contain no '('.
            pieces = value.split('(')
            detail = pieces[1].strip() if len(pieces) > 1 else ''
            print('[%s] %s (%s' % (filesize, filename, detail))

            # The payload is a zip archive: write it in binary mode so no
            # newline translation can corrupt it, and close the file promptly.
            with open(os.path.join('ld_download', filename), 'wb') as outfile:
                outfile.write(response.read())
        finally:
            # Always release the connection, even when a download fails.
            conn.close()

    print('Done!')
if __name__ == '__main__':
    # Script entry point: scrape the form state, then fetch every archive.
    page_context = get_context()
    download_disclosures(page_context)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.