Skip to content

Instantly share code, notes, and snippets.

@jpwatts
Created May 26, 2011 21:15
Show Gist options
  • Save jpwatts/994100 to your computer and use it in GitHub Desktop.
Scraper for Houston's active incident report
#!/usr/bin/env python
"""Scraper for Houston's active incident report"""
import collections
import csv
import operator
import sys
import dateutil.parser
import httplib2
import pyquery
__all__ = ['CACHE', 'URL', 'ActiveIncident', 'scrape', 'dump']
CACHE = None
URL = u"http://cbtcws.cityofhouston.gov/ActiveIncidents/Combined.aspx"
ActiveIncident = collections.namedtuple('ActiveIncident', 'agency address cross_street key_map call_time incident_type combined_response')
def _scrape():
    """Yield an ActiveIncident for each usable row of the city's report.

    Fetches URL (optionally through the httplib2 CACHE), parses the
    ``#dgResults`` HTML table, and yields one ActiveIncident per row.
    Rows are skipped silently when they are missing an address or an
    incident type, when their call time does not parse, or when they do
    not contain exactly the 7 expected columns.
    """
    _response, html = httplib2.Http(CACHE).request(URL)
    # The first table row is the column header; skip it.
    rows = pyquery.PyQuery(html)('#dgResults tr')[1:]
    for row in rows:
        cols = [td.text_content().strip().upper()
                for td in pyquery.PyQuery('td', row)]
        if len(cols) != 7:
            continue  # Skip malformed rows: ActiveIncident needs exactly 7 fields.
        if not cols[1]:
            continue  # Skip records without an address.
        try:
            cols[4] = dateutil.parser.parse(cols[4])
        except ValueError:
            continue  # Skip records with an invalid call time.
        if not cols[5]:
            continue  # Skip records without an incident type.
        cols[6] = cols[6] == 'Y'  # Normalize the combined-response flag to bool.
        yield ActiveIncident(*cols)
def scrape():
    """Return all active incidents as a list, most recent call time first."""
    incidents = list(_scrape())
    incidents.sort(key=operator.attrgetter('call_time'), reverse=True)
    return incidents
def dump(f):
    """Write the current active incidents to file object *f* as CSV.

    The first row is a human-readable header; each following row is one
    ActiveIncident, most recent first.
    """
    header = (
        u"Agency (FD/PD)",
        u"Address",
        u"Cross Street",
        u"Key Map",
        u"Call Time (Opened)",
        u"Incident Type",
        u"Combined Response (Y/N)",
    )
    writer = csv.writer(f)
    writer.writerow(header)
    for incident in scrape():
        writer.writerow(incident)
if __name__ == '__main__':
    # When run as a script, print the CSV report to stdout.
    dump(sys.stdout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment