zstumgoren/scraper.py

## scraper.py
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import csv
import re

# Fetch the appropriation page at URL, parse it, and collect text pairs from
# the rows of its border=6 table into the module-level ``data`` dict.

# NOTE: using the live URL instead of a locally cached file
URL = "http://cpms.dfa.state.nm.us/doShowAppropriations.aspx?pid=10-1100"
page = urllib2.urlopen(URL)
try:
    html = page.read()
finally:
    # Release the HTTP response even if the read fails.
    page.close()
# NOTE: Python's native html parser choked on this html on my machine,
# so had to resort to lxml
soup = BeautifulSoup(html, "lxml")

# All text nodes matching the "Appropriation ID:" pattern.
# NOTE: ``id`` shadows the builtin of the same name; the name is kept so that
# anything already reading this module-level variable keeps working.
id = soup.find_all(text=re.compile(r"Appropriation ID:"))


table = soup.find('table', border=6)
data = {}
# The first two rows are skipped (presumably header rows -- confirm against
# the live page).
for row in table.find_all('tr')[2:]:
    # Note, using a naive strategy of grabbing 2nd and 4th cells of each row.
    # This may require more careful handling if the position is inconsistent
    # across table rows.
    cells = row.find_all('td')
    if len(cells) < 4:
        # Rows without a 4th cell cannot yield a key/value pair; skip them
        # rather than aborting the whole scrape with an IndexError.
        continue
    key = cells[1].text.strip()
    value = cells[3].text.strip()
    data[key] = value