Created
June 23, 2013 16:04
-
-
Save abelsonlive/5845517 to your computer and use it in GitHub Desktop.
the scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas | |
import requests | |
import re | |
import urllib | |
import itertools | |
import uuid | |
import json | |
from urlparse import urljoin | |
from BeautifulSoup import BeautifulSoup | |
from datetime import datetime | |
from itertools import groupby | |
from Queue import Queue | |
from threading import Thread | |
import logging | |
# Master spreadsheet listing the sites to scrape: one row per site with its
# front-page URL ('site_url'), an article-URL regex ('site_regex'), and a
# human-readable name ('site_name').
master_csv = 'https://docs.google.com/spreadsheet/pub?key=0An9Q5Mkz4lG7dGt6ZTNZLU03cnZRMkQtMkdaZ1lhd1E&output=csv'
d = pandas.read_csv(master_csv)

# Scrape timestamp, seconds since epoch.
# NOTE(review): "%s" is a platform-specific strftime extension (Linux/OSX,
# not Windows) -- confirm deployment target.
time = datetime.now().strftime("%s")

data = []
for i in d.index:
    url = d['site_url'][i]
    regex = d['site_regex'][i]
    news_org = d['site_name'][i]

    # Collect every hyperlink on the site's front page.
    soup = BeautifulSoup(requests.get(url).text)
    links = [a['href'].encode('utf-8') for a in soup.findAll('a', href=True)]

    # Strip the site URL where it appears as a literal prefix, then resolve
    # everything back to an absolute URL.  re.escape so metacharacters in
    # the URL (".", "?", ...) are matched literally, not as regex syntax.
    links = [re.sub(re.escape(url), "", l) for l in links]
    links = [urljoin(url, l) for l in links]

    # Keep only links matching this site's article pattern...
    articles = [l for l in links if re.match(regex, l)]

    # ...and de-duplicate while preserving page order.  (The original used
    # itertools.groupby without sorting, which only drops *adjacent*
    # duplicates.)
    seen = set()
    unique_articles = []
    for a in articles:
        if a not in seen:
            seen.add(a)
            unique_articles.append(a)

    for a in unique_articles:
        data.append({'url': a, 'news_org': news_org, 'time': time})
def threaded(items, func, num_threads=100, max_queue=200):
    """Apply func(item) to every item using a pool of daemon worker threads.

    Blocks until every queued item has been processed (queue.join).
    Exceptions raised by func are logged with traceback and swallowed so
    a single bad item cannot kill its worker thread.

    :param items: iterable of work items fed to the queue.
    :param func: callable invoked once per item.
    :param num_threads: number of worker threads to start.
    :param max_queue: queue capacity; the producer blocks when full.
    """
    def queue_consumer():
        # Each worker loops forever; threads are daemonic, so they die
        # with the main thread once queue.join() returns.
        while True:
            item = queue.get(True)
            try:
                func(item)
            except Exception as e:
                # BUGFIX: was `log.exception(e)` -- `log` is never defined
                # in this file; use the imported logging module.
                logging.exception(e)
            queue.task_done()

    queue = Queue(maxsize=max_queue)
    for i in range(num_threads):
        t = Thread(target=queue_consumer)
        t.daemon = True  # don't block interpreter exit
        t.start()
    for item in items:
        queue.put(item, True)  # block while the queue is full
    queue.join()
def get_article(d):
    """Fetch one article and persist it as JSON.

    Mutates `d` in place -- adds 'html' (raw page source) plus 'title' and
    'text' extracted by the boilerpipe web service (both None on extraction
    failure) -- then dumps the dict to data/<random uuid>.json.

    :param d: dict with at least a 'url' key (built by the scraper loop).
    """
    url = d['url']
    print("fetching stuff for %s" % url)
    d['html'] = requests.get(url).text

    # Extract readable article body via the boilerpipe web service.
    api_url = "http://boilerpipe-web.appspot.com/extract?url=" + urllib.quote_plus(url) + "&extractor=ArticleExtractor&output=json"
    # BUGFIX: with requests >= 1.0, .json is a method and must be called;
    # the bare attribute access returned the bound method, breaking the
    # subscripting below.
    bp = requests.get(api_url).json()
    if bp['status'] == "success":
        d['title'] = bp['response']['title']
        # BUGFIX: original had a stray trailing `s` (`['content']s`),
        # a syntax error.
        d['text'] = bp['response']['content']
    else:
        d['title'] = None
        d['text'] = None

    # Write under a random file name.  Assumes data/ already exists --
    # TODO(review): confirm, or create it with os.makedirs.
    fname = "data/" + uuid.uuid4().hex + ".json"
    # BUGFIX: original printed the literal "%s" -- the % fname arg was missing.
    print("writing %s to file" % fname)
    with open(fname, 'w') as f:
        json.dump(d, f)
# Entry point: fan the collected article dicts out across the thread pool.
if __name__ == '__main__':
    threaded(data, get_article)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment