Created
October 15, 2013 10:31
-
-
Save vdavez/6989612 to your computer and use it in GitHub Desktop.
Can anyone figure out why this doesn't work? Here is a sample record from the JSON data that the script reads: { "description": "Opinion", "url": "GetDoc.asp?Database=CAB_DOCS&docnum=25884&version=1&minLevel=0", "date_filed": "10/9/2013", "case_number": "P-0943", "file_size": "48222", "row_id": "0" }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Download every document listed in ``blob.json`` from app.cab.dc.gov.

Each record in ``blob.json`` is expected to carry at least the keys
``url`` (relative to the WorkSite base URL), ``case_number`` and
``date_filed``.  Every document is saved as
``opinions/<case_number>_<date_filed with '/' replaced by '-'>``.

The script targets Python 2 (``cookielib``) with ``mechanize`` installed.
"""
import cookielib
import json
import os
import urllib

import mechanize

BASE_URL = 'http://app.cab.dc.gov/WorkSite/'
OUTPUT_DIR = 'opinions'

# Initialize outfile.  Nothing is written to it below, but opening it
# creates/truncates glob.html; the ``with`` block guarantees it is closed.
with open('glob.html', 'w') as out:
    # Open the browser and initialize the cookie jar
    br = mechanize.Browser()
    cj = cookielib.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    # Follow refresh 0 but don't hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    br.set_debug_http(True)
    br.set_debug_redirects(True)
    br.set_debug_responses(True)

    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    # Visit the listing page first so the cookie jar holds whatever
    # cookies the site sets before the documents are requested.
    br.open(BASE_URL + 'Published_Board_Decisions.asp')
    br.select_form(name="TheForm")

    # Load the record list, closing the JSON file as soon as it is parsed.
    with open('blob.json') as json_data:
        data = json.load(json_data)

    # br.retrieve() writes to a local path and cannot create missing
    # directories, so make sure the target directory exists up front.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for record in data:
        # '/' in the filing date would be read as a path separator.
        date = record['date_filed'].replace('/', '-')
        file_name = os.path.join(OUTPUT_DIR, record['case_number'] + '_' + date)
        doc_url = BASE_URL + record['url']
        # Fetch the document through the same browser session and save
        # it to file_name.
        br.retrieve(doc_url, file_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment