Skip to content

Instantly share code, notes, and snippets.

@gregelin
Forked from jcarbaugh/lddl.py
Created May 14, 2009 19:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gregelin/111835 to your computer and use it in GitHub Desktop.
Save gregelin/111835 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import httplib
import urllib, urllib2
import os
import re
# Host name that serves the download form; used both for the initial GET
# in get_context() and for the POST connections in download_disclosures().
HTTP_HOST = "disclosures.house.gov"

# Path of the download form on HTTP_HOST (fetched with GET, submitted with POST).
HTTP_PATH = "/ld/LDDownload.aspx"
def get_context():
    """Fetch the download page and scrape the state needed to submit its form.

    Returns:
        A dict with three keys:
          'viewstate'   -- value of the hidden ``__VIEWSTATE`` input,
          'evalidation' -- value of the hidden ``__EVENTVALIDATION`` input,
          'disclosures' -- list of ``(option_value, filename)`` tuples, one
                           per ``<option>`` matched on the page.  The file
                           name is the part of the option value before the
                           first '(' with spaces removed, plus '.zip'.

    Raises:
        IndexError: if either hidden input is missing from the page (same
            exception type the plain ``findall(...)[0]`` lookup produced,
            but now with a message naming the missing field).
    """
    url = 'http://%s%s' % (HTTP_HOST, HTTP_PATH)
    page = urllib2.urlopen(url)
    try:
        content = page.read()
    finally:
        # Always release the socket, even if the read fails part-way.
        page.close()

    disc_re = re.compile(r'<option value="(?P<value>.*?)">(?P<description>.*?)</option>')
    disclosures = disc_re.findall(content)

    vs_re = re.compile(r'<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(?P<viewstate>.*?)" />')
    ev_re = re.compile(r'<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="(?P<evalidation>.*?)" />')

    hidden = {}
    for key, field_re, field_name in (
            ('viewstate', vs_re, '__VIEWSTATE'),
            ('evalidation', ev_re, '__EVENTVALIDATION')):
        match = field_re.search(content)
        if match is None:
            # Keep IndexError so existing callers see the same exception
            # type, but say which field the page no longer contains.
            raise IndexError('hidden field %s not found in %s' % (field_name, url))
        # Group 1 of the first match is what findall(...)[0] returned.
        hidden[key] = match.group(1)

    return {
        'viewstate': hidden['viewstate'],
        'evalidation': hidden['evalidation'],
        # Each findall() hit is a (value, description) tuple; only the
        # value is needed, both for the POST and to derive the file name.
        'disclosures': [
            (d[0], '%s.zip' % d[0].split('(')[0].replace(' ', ''))
            for d in disclosures
        ],
    }
def download_disclosures(context):
    """Download every archive listed in *context* into ``./ld_download``.

    For each ``(value, filename)`` pair the form at HTTP_PATH is submitted
    with a POST carrying the scraped ``__VIEWSTATE`` / ``__EVENTVALIDATION``
    values, and the response body is written to ``ld_download/<filename>``.
    One progress line per file is printed (size taken from the response's
    Content-Length header, 0 if absent), followed by 'Done!'.

    Args:
        context: dict with keys 'viewstate', 'evalidation' and
            'disclosures' (a sequence of ``(value, filename)`` pairs), as
            returned by get_context().
    """
    if not os.path.exists('ld_download'):
        os.mkdir('ld_download')

    headers = {"Content-type": "application/x-www-form-urlencoded"}

    for disclosure in context['disclosures']:
        value = disclosure[0]
        filename = disclosure[1]
        params = {
            'selFilesXML': value,
            '__VIEWSTATE': context['viewstate'],
            '__EVENTVALIDATION': context['evalidation'],
            'btnDownloadXML': 'Download'
        }

        conn = httplib.HTTPConnection(HTTP_HOST)
        try:
            conn.request('POST', HTTP_PATH, urllib.urlencode(params), headers)
            response = conn.getresponse()

            content_length = float(response.getheader('Content-Length', 0))
            kb = content_length / 1024.0
            if kb < 1024.0:
                filesize = "%0.1fKB" % kb
            else:
                filesize = "%0.1fMB" % (kb / 1024.0,)

            # The option value normally looks like "name (details)"; show
            # the part after the first '(' when present, and fall back to
            # a shorter line instead of raising IndexError when it is not.
            parts = value.split('(')
            if len(parts) > 1:
                print('[%s] %s (%s' % (filesize, filename, parts[1].strip()))
            else:
                print('[%s] %s' % (filesize, filename))

            # Binary mode: the payload is a zip archive, and text mode
            # would corrupt it on platforms that translate newlines.
            with open(os.path.join('ld_download', filename), 'wb') as out:
                # Stream in chunks so large archives are not held in memory.
                while True:
                    chunk = response.read(65536)
                    if not chunk:
                        break
                    out.write(chunk)
        finally:
            # Close the connection even if the request or the write fails.
            conn.close()

    print('Done!')
# Script entry point: scrape the form state from the download page, then
# use it to fetch every listed archive.
if __name__ == '__main__':
    download_disclosures(
        get_context()
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment