Skip to content

Instantly share code, notes, and snippets.

@zstumgoren
Created May 5, 2015 23:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zstumgoren/2616924a91811e19ed7d to your computer and use it in GitHub Desktop.
Save zstumgoren/2616924a91811e19ed7d to your computer and use it in GitHub Desktop.
Tweaks to scraper code for Fish
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import csv
import re
# Scrape the appropriations table for one project from the New Mexico CPMS
# site and collect it into the `data` dict as {label: value} strings.

# NOTE: using the live URL instead of a locally cached file
URL = "http://cpms.dfa.state.nm.us/doShowAppropriations.aspx?pid=10-1100"

# Download the page; close the connection even if the read fails.
page = urllib2.urlopen(URL)
try:
    html = page.read()
finally:
    page.close()

# NOTE: Python's native html parser choked on this html on my machine,
# so had to resort to lxml
soup = BeautifulSoup(html, "lxml")

# Text nodes that contain the "Appropriation ID:" label. Not used further
# below; kept for reference. (Renamed from `id`, which shadowed the builtin.)
appropriation_id_labels = soup.find_all(text=re.compile("Appropriation ID:"))

# The data of interest lives in the table rendered with border="6".
table = soup.find('table', border=6)
if table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("No table with border=6 found at %s" % URL)

data = {}
# The first two rows are skipped (presumably headers -- confirm against page).
for row in table.findAll('tr')[2:]:
    # Note, using a naive strategy of grabbing 2nd and 4th cells of each row.
    # This may require more careful handling if the position is inconsistent
    # across table rows.
    cells = row.findAll('td')
    if len(cells) < 4:
        # Spacer/header/colspan rows have no key/value pair at these
        # positions; skip them instead of raising IndexError.
        continue
    key = cells[1].text.strip()
    value = cells[3].text.strip()
    data[key] = value
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment