public
Created

A simple example of a ScraperWiki Python scraper, written for a recent Hacks/Hackers L.A. meetup.

  • Download Gist
fire_scraperwiki.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
#!/usr/bin/env python
 
import scraperwiki
import logging
import re
import time, datetime
import mechanize
from BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
 
# Log everything at DEBUG level and above; basicConfig writes to stderr by default.
logging.basicConfig(level=logging.DEBUG)
 
# Single shared mechanize browser instance, used by gather_main_content for HTTP fetches.
mech = mechanize.Browser()
 
# gather each url from target page and write to list
def gather_main_content(url_to_scrape):
    """Scrape the CalFire current-incidents page and save each fire to ScraperWiki.

    Opens ``url_to_scrape`` with the module-level mechanize browser, parses the
    HTML with BeautifulSoup, and walks each ``<table class="incident_table">``
    (skipping the first one, which on this page is not a fire record). For each
    remaining table, the fire name and county are pulled from the first two data
    rows and saved to the ScraperWiki datastore keyed on a derived unique id.

    Args:
        url_to_scrape: URL of the incidents listing page to fetch.
    """
    logging.debug('running gather_main_content function')
    page_scrape = mech.open(url_to_scrape)
    html_scrape = page_scrape.read()
    soup_scrape = BeautifulSoup(html_scrape, convertEntities=BeautifulSoup.HTML_ENTITIES)
    # Skip the first incident_table; only the subsequent tables hold fire data.
    data_table = soup_scrape.findAll('table', {'class': 'incident_table'})[1:]
    for table in data_table:
        # Skip each table's header row.
        data_rows = table.findAll('tr')[1:]
        # Guard against malformed tables that lack the two expected data rows,
        # which would otherwise raise IndexError below.
        if len(data_rows) < 2:
            logging.debug('skipping incident table with fewer than 2 data rows')
            continue
        fire_name = extract_data_from_cells(data_rows[0])
        county = extract_data_from_cells(data_rows[1])
        unique_id = fire_name + '-hacks-hackers'
        logging.debug(fire_name + ' - ' + county)

        # Save one record per fire, keyed on 'id' so reruns update in place
        # rather than duplicating rows.
        unique_keys = ['id']
        fire_narrative = {
            'id': unique_id,
            'name': fire_name,
            'county': county,
        }
        scraperwiki.sql.save(unique_keys, fire_narrative)
 
def extract_data_from_cells(row_name):
    """Return the UTF-8 encoded text of the second ``<td>`` cell in *row_name*.

    Args:
        row_name: a parsed table-row element exposing ``findAll('td')``.
    """
    cells = row_name.findAll('td')
    second_cell = cells[1]
    return second_cell.text.encode('utf-8')
 
# Script entry point: scrape the CalFire current-incidents listing.
if __name__ == "__main__":
    gather_main_content('http://cdfdata.fire.ca.gov/incidents/incidents_current')

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.