Created
November 20, 2014 20:47
-
-
Save bjornarneson/2396ac73285cd3b795dd to your computer and use it in GitHub Desktop.
Scrape fiscal notes from Minnesota Management and Budget website to local directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# modules we're using
# (be sure that lxml is in your python installation)
import contextlib
import os
import shutil
import urllib2
import urlparse

import lxml.html
# Namespace dictionary passed to the xpath() call. Binding the "re" prefix to
# the EXSLT regular-expressions namespace is what lets the XPath expression
# use regular expressions (re:test).
ns = {'re': 'http://exslt.org/regular-expressions'}
# Fiscal note search pages to scrape.
#   key:   name of the local directory the PDFs from that page are saved into
#   value: URL of the search page (one per `ses_val` period)
# Uncomment a line to include that period in the run. Every entry ends with a
# comma so lines can be toggled independently without breaking the literal.
d = {
    '2015-16': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2015-16',
    # '2013-14': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2013-14',
    # '2011-12': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2011-12',
    # '2009-10': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2009-10',
    # '2007-08': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2007-08',
    # '2005-06': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2005-06',
    # '2003-04': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2003-04',
    # '2001-02': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2001-02',
    # '1999-00': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=1999-00',
    # '1997-98': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=1997-98',
    # '1995-96': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=1995-96',
}
# Walk each configured search page and save every PDF it links to into a
# local directory named after the dictionary key. Files that already exist
# locally are left untouched, so the script can be re-run to pick up new notes.
for out_dir, base_url in d.items():  # items() works on Python 2 and 3 alike
    # Fetch the page and parse the response into an element tree. urllib2
    # responses are not context managers on Python 2, so closing() is what
    # guarantees the connection is released.
    with contextlib.closing(urllib2.urlopen(base_url)) as page:
        tree = lxml.html.fromstring(page.read())

    # Select all <a> tags whose href ends in ".pdf" (case-insensitive). The
    # raw string keeps "\." as a regex escape instead of a string escape.
    pdf_links = tree.xpath(r'//a[re:test(@href, "\.pdf$", "i")]', namespaces=ns)
    for node in pdf_links:
        href = node.attrib['href']
        url = urlparse.urljoin(base_url, href)  # resolve relative links
        filename = os.path.basename(href)
        target = os.path.join(out_dir, filename)

        if not os.path.isdir(out_dir):  # if the directory doesn't exist, create it
            os.makedirs(out_dir)
            # One pre-formatted string prints identically on Python 2 and 3;
            # print('a', b) would emit a tuple under Python 2's print statement.
            print('creating new directory: %s' % out_dir)

        if os.path.exists(target):  # don't overwrite existing files
            print('%s already exists in %s' % (filename, out_dir))
            continue

        # Download under a temporary name and rename only after the copy has
        # finished. Otherwise an interrupted transfer would leave a truncated
        # PDF that the existence check above skips on every later run.
        partial = target + '.part'
        try:
            with contextlib.closing(urllib2.urlopen(url)) as response:
                with open(partial, 'wb') as fp:
                    shutil.copyfileobj(response, fp)
            os.rename(partial, target)
        finally:
            # After a successful rename there is nothing left to remove; this
            # only fires when the download or rename failed or was interrupted.
            if os.path.exists(partial):
                os.remove(partial)
        print('saving %s' % filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment