# chrislkeller/fire_scraper.py

## fire_scraper.py
#!/usr/bin/env python

import logging
import re
import time, datetime
import mechanize
from BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

# Log every record at DEBUG level and above; basicConfig's default handler
# writes to stderr.
logging.basicConfig(level=logging.DEBUG)

# Module-level mechanize browser; gather_main_content() uses it to fetch pages.
mech = mechanize.Browser()

def gather_main_content(url_to_scrape):
    """Scrape the incident tables at *url_to_scrape* and save one record per fire.

    The page is fetched with the module-level ``mech`` browser and parsed with
    BeautifulSoup.  For every ``<table class="incident_table">`` after the
    first, the fire name and the county are read from the second cell of the
    table's second and third rows.  Each fire is then stored through
    ``scraperwiki.sql.save`` using ``id`` as the unique key.

    Tables that do not provide those rows/cells are logged and skipped so that
    one malformed table does not abort the whole scrape.

    Args:
        url_to_scrape: URL of the page containing the incident tables.

    Returns:
        None.  Results are written to the ScraperWiki datastore.
    """
    # scraperwiki is used below but was never imported anywhere in the file,
    # so the first save raised NameError; import it here, before any network
    # work is done.
    import scraperwiki

    logging.debug('running gather_main_content function')
    html_scrape = mech.open(url_to_scrape).read()
    soup_scrape = BeautifulSoup(html_scrape, convertEntities=BeautifulSoup.HTML_ENTITIES)

    unique_keys = ['id']

    # [1:] skips the first incident_table -- presumably a header/summary
    # table rather than an incident; TODO confirm against the live page.
    for table in soup_scrape.findAll('table', {'class': 'incident_table'})[1:]:
        # The first <tr> of each table is skipped; the next two rows are read
        # as fire name and county.
        data_rows = table.findAll('tr')[1:]
        try:
            fire_name = extract_data_from_cells(data_rows[0])
            county = extract_data_from_cells(data_rows[1])
        except IndexError:
            logging.warning('skipping incident table with unexpected layout')
            continue
        logging.debug('%s - %s', fire_name, county)

        fire_narrative = {
            'id': fire_name + '-hacks-hackers',
            'name': fire_name,
            'county': county,
        }
        scraperwiki.sql.save(unique_keys, fire_narrative)

def extract_data_from_cells(row_name):
    """Return the text of the second ``<td>`` in *row_name*, encoded as UTF-8.

    Raises IndexError when the row has fewer than two ``<td>`` cells.
    """
    return row_name.findAll('td')[1].text.encode('utf-8')

# When run as a script, scrape the fire.ca.gov current-incidents page.
if __name__ == "__main__":
    gather_main_content('http://cdfdata.fire.ca.gov/incidents/incidents_current')
	#!/usr/bin/env python

	import logging
	import re
	import time, datetime
	import mechanize
	from BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

	# Log every record at DEBUG level and above; basicConfig's default handler
	# writes to stderr.
	logging.basicConfig(level=logging.DEBUG)

	# Module-level mechanize browser; gather_main_content() uses it to fetch pages.
	mech = mechanize.Browser()

	def gather_main_content(url_to_scrape):
		"""Scrape the incident tables at *url_to_scrape* and save one record per fire.

		The page is fetched with the module-level ``mech`` browser and parsed with
		BeautifulSoup.  For every ``<table class="incident_table">`` after the
		first, the fire name and the county are read from the second cell of the
		table's second and third rows.  Each fire is then stored through
		``scraperwiki.sql.save`` using ``id`` as the unique key.

		Tables that do not provide those rows/cells are logged and skipped so that
		one malformed table does not abort the whole scrape.

		Args:
			url_to_scrape: URL of the page containing the incident tables.

		Returns:
			None.  Results are written to the ScraperWiki datastore.
		"""
		# scraperwiki is used below but was never imported anywhere in the file,
		# so the first save raised NameError; import it here, before any network
		# work is done.
		import scraperwiki

		logging.debug('running gather_main_content function')
		html_scrape = mech.open(url_to_scrape).read()
		soup_scrape = BeautifulSoup(html_scrape, convertEntities=BeautifulSoup.HTML_ENTITIES)

		unique_keys = ['id']

		# [1:] skips the first incident_table -- presumably a header/summary
		# table rather than an incident; TODO confirm against the live page.
		for table in soup_scrape.findAll('table', {'class': 'incident_table'})[1:]:
			# The first <tr> of each table is skipped; the next two rows are read
			# as fire name and county.
			data_rows = table.findAll('tr')[1:]
			try:
				fire_name = extract_data_from_cells(data_rows[0])
				county = extract_data_from_cells(data_rows[1])
			except IndexError:
				logging.warning('skipping incident table with unexpected layout')
				continue
			logging.debug('%s - %s', fire_name, county)

			fire_narrative = {
				'id': fire_name + '-hacks-hackers',
				'name': fire_name,
				'county': county,
			}
			scraperwiki.sql.save(unique_keys, fire_narrative)

	def extract_data_from_cells(row_name):
		"""Return the text of the second ``<td>`` in *row_name*, encoded as UTF-8.

		Raises IndexError when the row has fewer than two ``<td>`` cells.
		"""
		target_cell = row_name.findAll('td')
		return target_cell[1].text.encode('utf-8')

	# When run as a script, scrape the fire.ca.gov current-incidents page.
	if __name__ == "__main__":
		gather_main_content('http://cdfdata.fire.ca.gov/incidents/incidents_current')