Skip to content

Instantly share code, notes, and snippets.

@jpwatts
Created May 26, 2011 21:15
Show Gist options
  • Save jpwatts/994100 to your computer and use it in GitHub Desktop.
Scraper for Houston's active incident report
#!/usr/bin/env python
"""Scraper for Houston's active incident report"""
import collections
import csv
import operator
import sys
import dateutil.parser
import httplib2
import pyquery
__all__ = ['CACHE', 'URL', 'ActiveIncident', 'scrape', 'dump']
CACHE = None
URL = u"http://cbtcws.cityofhouston.gov/ActiveIncidents/Combined.aspx"
ActiveIncident = collections.namedtuple('ActiveIncident', 'agency address cross_street key_map call_time incident_type combined_response')
def _scrape():
    """Yield an ActiveIncident for each usable row of the city's report.

    Fetches URL (optionally through the httplib2 CACHE), parses the
    ``#dgResults`` HTML table, and yields one ActiveIncident per row.
    Rows are skipped silently when they are missing an address or an
    incident type, when their call time does not parse, or when they do
    not contain exactly the 7 expected columns.
    """
    _response, html = httplib2.Http(CACHE).request(URL)
    # The first table row is the column header; skip it.
    rows = pyquery.PyQuery(html)('#dgResults tr')[1:]
    for row in rows:
        cols = [td.text_content().strip().upper()
                for td in pyquery.PyQuery('td', row)]
        if len(cols) != 7:
            continue  # Skip malformed rows: ActiveIncident needs exactly 7 fields.
        if not cols[1]:
            continue  # Skip records without an address.
        try:
            cols[4] = dateutil.parser.parse(cols[4])
        except ValueError:
            continue  # Skip records with an invalid call time.
        if not cols[5]:
            continue  # Skip records without an incident type.
        cols[6] = cols[6] == 'Y'  # Normalize the combined-response flag to bool.
        yield ActiveIncident(*cols)
def scrape():
    """Return all active incidents as a list, most recent call time first."""
    incidents = list(_scrape())
    incidents.sort(key=operator.attrgetter('call_time'), reverse=True)
    return incidents
def dump(f):
    """Write the current active incidents to file object *f* as CSV.

    The first row is a human-readable header; each following row is one
    ActiveIncident, most recent first.
    """
    header = (
        u"Agency (FD/PD)",
        u"Address",
        u"Cross Street",
        u"Key Map",
        u"Call Time (Opened)",
        u"Incident Type",
        u"Combined Response (Y/N)",
    )
    writer = csv.writer(f)
    writer.writerow(header)
    for incident in scrape():
        writer.writerow(incident)
if __name__ == '__main__':
    # When run as a script, print the CSV report to stdout.
    dump(sys.stdout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment