Skip to content

Instantly share code, notes, and snippets.

@bjornarneson
Created November 20, 2014 20:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bjornarneson/2396ac73285cd3b795dd to your computer and use it in GitHub Desktop.
Scrape fiscal notes from Minnesota Management and Budget website to local directory
"""Scrape fiscal-note PDFs from the Minnesota Management and Budget website.

For each configured legislative session, fetch that session's fiscal-note
search page, find every link whose href ends in ".pdf", and download the
PDFs into a local directory named after the session.  Files that already
exist locally are skipped rather than overwritten.
"""
# modules we're using
# (be sure that lxml is in your python installation)
import os
import shutil
import urllib.parse
import urllib.request

import lxml.html

# Namespace dictionary passed to xpath() below; it enables EXSLT regular
# expressions inside the XPath query (the re:test() function).
ns = {'re': 'http://exslt.org/regular-expressions'}

# Map of local directory name -> fiscal-note search page for that session.
# Uncomment additional sessions to scrape them as well.
d = {
    # DIR: base_url
    '2015-16': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2015-16',
    # '2013-14': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2013-14',
    # '2011-12': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2011-12',
    # '2009-10': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2009-10',
    # '2007-08': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2007-08',
    # '2005-06': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2005-06',
    # '2003-04': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2003-04',
    # '2001-02': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2001-02',
    # '1999-00': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=1999-00',
    # '1997-98': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=1997-98',
    # '1995-96': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=1995-96',
}

for DIR, base_url in d.items():
    # Fetch the search page and parse the response into an lxml document
    # tree; the 'with' block ensures the HTTP response is closed.
    with urllib.request.urlopen(base_url) as res:
        tree = lxml.html.fromstring(res.read())
    # Iterate over all <a> tags whose href ends in ".pdf" (case-insensitive).
    for node in tree.xpath(r'//a[re:test(@href, "\.pdf$", "i")]', namespaces=ns):
        # Resolve the (possibly relative) href against the page URL.
        url = urllib.parse.urljoin(base_url, node.attrib['href'])
        filename = os.path.basename(node.attrib['href'])
        if not os.path.isdir(DIR):  # if the directory doesn't exist, create it
            os.makedirs(DIR)
            print('creating new directory:', DIR)
        dest = os.path.join(DIR, filename)
        if not os.path.exists(dest):  # don't overwrite existing files
            # Stream the PDF to disk; both the HTTP response and the
            # destination file are closed even if the copy fails.
            with urllib.request.urlopen(url) as req, open(dest, 'wb') as fp:
                shutil.copyfileobj(req, fp)
            print('saving', filename)
        else:
            print(filename, 'already exists in', DIR)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment