Created
March 22, 2012 01:33
-
-
Save decause/2155040 to your computer and use it in GitHub Desktop.
Silly Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Silly Scraper: print page text that mentions New York City.

Fetches http://labor.ny.gov/app/warn/ and prints every text node that
contains the phrase "New York City".

Planned workflow (only the first scrape-and-search pass is implemented):

User inputs URL
User inputs email
User inputs keyword
User provides Frequency?
Scrape URL, all of it
Find keywords
save v1
When freq = x
Scrape URL
Find keywords
save v2
diff v1 v2
send message with the diff
"""
import re
import urllib2

from BeautifulSoup import BeautifulSoup

# User inputs URL (hard-coded for now).
page = urllib2.urlopen("http://labor.ny.gov/app/warn/")
try:
    # Scrape URL, all of it. The parser consumes the response while the soup
    # is built, so the connection can be released right afterwards.
    soup = BeautifulSoup(page)
finally:
    # urllib2 responses are not context managers on Python 2; close explicitly
    # so the socket is released even if parsing raises.
    page.close()

# Find keywords: collect every text node matching the phrase.
warns_nyc = soup.findAll(text=re.compile("New York City"))
for warn in warns_nyc:
    # Parenthesized single-argument form prints identically on Python 2 and
    # also parses on Python 3.
    print(warn)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment