Skip to content

Instantly share code, notes, and snippets.

@chrislkeller
Last active December 18, 2015 13:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chrislkeller/5789120 to your computer and use it in GitHub Desktop.
Save chrislkeller/5789120 to your computer and use it in GitHub Desktop.
A simple example of a ScraperWiki Python scraper for a recent Hacks/Hackers L.A. meetup...
#!/usr/bin/env python
import logging
import re
import time, datetime
import mechanize
from BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
# Configure the root logger at DEBUG level so every message is emitted;
# basicConfig's default handler writes to stderr.
logging.basicConfig(level=logging.DEBUG)
# Single module-level mechanize browser; gather_main_content uses it to
# fetch the page to scrape.
mech = mechanize.Browser()
def gather_main_content(url_to_scrape):
    """Scrape incident tables from a page and save each fire's name and county.

    Fetches ``url_to_scrape`` with the module-level mechanize browser, parses
    the HTML with BeautifulSoup, and walks every ``<table>`` whose class is
    ``incident_table`` except the first match. Within each table the first
    ``<tr>`` is skipped; the next two rows are handed to
    ``extract_data_from_cells`` to obtain the fire name and the county. One
    record per table is written with ``scraperwiki.sql.save``, keyed on
    ``id`` (the fire name plus the suffix ``-hacks-hackers``).

    A table that lacks the expected rows or cells is logged and skipped so
    that one malformed table does not abort the rest of the scrape.

    Args:
        url_to_scrape: URL of the page that holds the incident tables.

    Returns:
        None. Results are persisted through ``scraperwiki.sql.save``.
    """
    # Imported locally because the module never imports scraperwiki at the
    # top level, which made the save call below fail with a NameError.
    import scraperwiki

    logging.debug('running gather_main_content function')
    page_scrape = mech.open(url_to_scrape)
    html_scrape = page_scrape.read()
    soup_scrape = BeautifulSoup(
        html_scrape, convertEntities=BeautifulSoup.HTML_ENTITIES)

    # NOTE(review): the first matching table is dropped on purpose by the
    # original code -- presumably it is not an incident record; confirm
    # against the live page layout.
    data_table = soup_scrape.findAll('table', {'class': 'incident_table'})[1:]

    # The key list never changes, so build it once outside the loop.
    unique_keys = ['id']

    for table in data_table:
        # Skip the first row of each table; the following rows carry the data.
        data_rows = table.findAll('tr')[1:]
        try:
            fire_name = extract_data_from_cells(data_rows[0])
            county = extract_data_from_cells(data_rows[1])
        except IndexError:
            # Too few rows, or a row with too few cells: skip this table
            # instead of crashing the whole run.
            logging.warning(
                'skipping incident table with unexpected layout '
                '(%d data rows)', len(data_rows))
            continue

        unique_id = fire_name + '-hacks-hackers'
        logging.debug('%s - %s', fire_name, county)

        # Saving data
        fire_narrative = {
            'id': unique_id,
            'name': fire_name,
            'county': county,
        }
        scraperwiki.sql.save(unique_keys, fire_narrative)
def extract_data_from_cells(row_name):
    """Return the UTF-8 encoded text of the second ``<td>`` cell in a row.

    Args:
        row_name: A parsed table-row element exposing ``findAll``.

    Returns:
        The ``.text`` of the row's second ``td`` cell, encoded as UTF-8.

    Raises:
        IndexError: If the row contains fewer than two ``td`` cells.
    """
    target_cell = row_name.findAll('td')
    if len(target_cell) < 2:
        # Same exception type the bare index lookup would raise, but with a
        # message that says what was wrong with the row.
        raise IndexError(
            'expected at least 2 <td> cells in row, found %d'
            % len(target_cell))
    # Index 1 is the second cell; the first cell is not read here.
    return target_cell[1].text.encode('utf-8')
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    gather_main_content('http://cdfdata.fire.ca.gov/incidents/incidents_current')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment