Last active
December 18, 2015 13:19
-
-
Save chrislkeller/5789120 to your computer and use it in GitHub Desktop.
A simple example of a ScraperWiki Python scraper for a recent Hacks/Hackers L.A. meetup...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import datetime
import logging
import re
import time

import mechanize
import scraperwiki
from BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
# Log everything from DEBUG upward; basicConfig's default handler writes
# to stderr.
logging.basicConfig(level=logging.DEBUG)

# Single mechanize browser instance shared by every page fetch in this module.
mech = mechanize.Browser()
def gather_main_content(url_to_scrape):
    """Scrape incident tables from a page and save each fire's name and county.

    Fetches ``url_to_scrape`` with the module-level ``mech`` browser, parses
    the HTML with BeautifulSoup, and walks every
    ``<table class="incident_table">`` except the first one found.  Within
    each table the first ``<tr>`` is skipped; the next two rows are read as
    the fire name and the county.  One record per table is written with
    ``scraperwiki.sql.save`` using ``'id'`` as the unique key.

    Tables that lack the expected rows or cells are logged and skipped so
    that one malformed table does not abort the whole run.

    Args:
        url_to_scrape: URL of the page to scrape.
    """
    logging.debug('running gather_main_content function')
    page_scrape = mech.open(url_to_scrape)
    html_scrape = page_scrape.read()
    soup_scrape = BeautifulSoup(
        html_scrape, convertEntities=BeautifulSoup.HTML_ENTITIES)

    # Loop-invariant: every record is keyed on its 'id' column.
    unique_keys = ['id']

    # NOTE(review): the first matching table is skipped, as in the original;
    # presumably it is not an incident entry -- confirm against the page.
    data_tables = soup_scrape.findAll('table', {'class': 'incident_table'})[1:]
    for table in data_tables:
        # The first row of each table is skipped; rows 2 and 3 hold the data.
        data_rows = table.findAll('tr')[1:]
        try:
            fire_name = extract_data_from_cells(data_rows[0])
            county = extract_data_from_cells(data_rows[1])
        except IndexError:
            # Too few rows, or a row with too few cells.
            logging.warning('skipping incident table with unexpected layout')
            continue

        unique_id = fire_name + '-hacks-hackers'
        logging.debug('%s - %s', fire_name, county)

        # Saving data
        fire_narrative = {
            'id': unique_id,
            'name': fire_name,
            'county': county,
        }
        scraperwiki.sql.save(unique_keys, fire_narrative)
def extract_data_from_cells(row_name):
    """Return the UTF-8 encoded text of the second ``<td>`` in a table row.

    Args:
        row_name: A parsed row element exposing ``findAll`` (for example a
            BeautifulSoup ``<tr>`` tag).

    Returns:
        The ``.text`` of the row's second ``<td>`` cell, encoded as UTF-8.

    Raises:
        IndexError: If the row contains fewer than two ``<td>`` cells.
    """
    cells = row_name.findAll('td')
    if len(cells) < 2:
        # Same exception type the bare index would raise, with a clearer message.
        raise IndexError(
            'expected at least 2 <td> cells in row, found %d' % len(cells))
    return cells[1].text.encode('utf-8')
# Script entry point: scrape the current-incidents page.
if __name__ == "__main__":
    gather_main_content('http://cdfdata.fire.ca.gov/incidents/incidents_current')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment