# bjornarneson/scrape_fiscal_notes_v2.py

## scrape_fiscal_notes_v2.py
# modules we're using
# (be sure that lxml is in your python installation)
import lxml.html, urllib2, urlparse, os, shutil

# Namespace map handed to lxml's xpath() call.  Binding the "re" prefix to
# the EXSLT regular-expressions namespace is what makes re:test() usable
# inside the XPath expression.
ns = {'re': 'http://exslt.org/regular-expressions'}

# Fiscal note search pages to scrape, as {DIR: base_url}:
#   DIR      -- local directory the PDFs of that session are saved into
#   base_url -- search page listing that session's fiscal notes
# Uncomment further sessions as needed.  Every entry ends with a comma so
# that lines can be enabled or disabled independently without producing a
# syntax error.
d = {
    '2015-16': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2015-16',
    # '2013-14': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2013-14',
    # '2011-12': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2011-12',
    # '2009-10': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2009-10',
    # '2007-08': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2007-08',
    # '2005-06': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2005-06',
    # '2003-04': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2003-04',
    # '2001-02': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=2001-02',
    # '1999-00': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=1999-00',
    # '1997-98': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=1997-98',
    # '1995-96': 'http://www.mmb.state.mn.us/cgi-bin/fnts_search.pl?ses_val=1995-96',
}

def _download(url, dest):
    """Download `url` and store it at `dest`.

    The payload is first written to ``dest + '.part'`` and only renamed to
    `dest` once the copy has finished.  An interrupted transfer therefore
    never leaves a truncated file under the final name, which the
    "already exists" check in the main loop would otherwise skip forever.

    Any exception raised by urlopen, the file write or the rename
    propagates to the caller; the HTTP response is closed in every case.
    """
    partial = dest + '.part'
    response = urllib2.urlopen(url)
    try:
        with open(partial, 'wb') as fp:
            shutil.copyfileobj(response, fp)
    finally:
        # urllib2 responses are not context managers, so close explicitly
        response.close()
    os.rename(partial, dest)


# Walk every configured search page and mirror the PDFs it links to into
# the directory named by the dictionary key.
for DIR, base_url in d.items():
    # fetch the search page and parse it into an element tree
    res = urllib2.urlopen(base_url)
    try:
        tree = lxml.html.fromstring(res.read())
    finally:
        res.close()

    # iterate over all <a> tags whose href ends in ".pdf" (case-insensitive);
    # raw string so the backslash reaches the regex engine untouched
    for node in tree.xpath(r'//a[re:test(@href, "\.pdf$", "i")]', namespaces=ns):
        href = node.attrib['href']
        # resolve the (possibly relative) link against the search page URL
        url = urlparse.urljoin(base_url, href)
        filename = os.path.basename(href)

        # create the target directory lazily, i.e. only once a PDF link
        # has actually been found for this session
        if not os.path.isdir(DIR):
            os.makedirs(DIR)
            # %-formatting prints a plain message whether print is the
            # Python 2 statement or a function (no tuple repr)
            print('creating new directory: %s' % DIR)

        target = os.path.join(DIR, filename)
        if os.path.exists(target):
            # don't overwrite existing files
            print('%s already exists in %s' % (filename, DIR))
            continue

        _download(url, target)
        print('saving %s' % filename)