@abelsonlive
Created June 23, 2013 16:04
the scraper
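# Reads a master spreadsheet of news sites (homepage url, article-url regex,
# site name), pulls candidate article links off each homepage, then fetches
# every matched article in parallel and writes one json record per article
# (raw html, boilerpipe-extracted title and text) to data/.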
import pandas
import requests
import re
import urllib
import uuid
import json
import logging
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup
from datetime import datetime
from itertools import groupby
from Queue import Queue
from threading import Thread

# module-level logger used by the worker threads
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
# master spreadsheet: one row per site, with its homepage url, an article-url
# regex, and a human-readable name
master_csv = 'https://docs.google.com/spreadsheet/pub?key=0An9Q5Mkz4lG7dGt6ZTNZLU03cnZRMkQtMkdaZ1lhd1E&output=csv'
d = pandas.read_csv(master_csv)
time = datetime.now().strftime("%s")  # unix timestamp for this scrape run

# collect candidate article urls from each site's homepage
data = []
for i in d.index:
    url = d['site_url'][i]
    regex = d['site_regex'][i]
    news_org = d['site_name'][i]

    soup = BeautifulSoup(requests.get(url).text)
    links = [a['href'].encode('utf-8') for a in soup.findAll('a', href=True)]
    links = [re.sub(re.escape(url), "", l) for l in links]  # strip the site's own prefix
    links = [urljoin(url, l) for l in links]                 # resolve relative links
    articles = [l for l in links if re.match(regex, l)]      # keep only article-shaped urls
    articles = [a for a, _ in groupby(articles)]             # drop consecutive duplicates
    for a in articles:
        data.append({'url': a, 'news_org': news_org, 'time': time})
def threaded(items, func, num_threads=100, max_queue=200):
    """Run func over items with a pool of daemon worker threads."""

    def queue_consumer():
        while True:
            item = queue.get(True)
            try:
                func(item)
            except Exception, e:
                log.exception(e)
            queue.task_done()

    queue = Queue(maxsize=max_queue)
    for i in range(num_threads):
        t = Thread(target=queue_consumer)
        t.daemon = True
        t.start()

    for item in items:
        queue.put(item, True)  # blocks when the queue is full

    queue.join()
def get_article(d):
    """Fetch an article's html and its boilerpipe extraction, then write one json file."""
    url = d['url']
    print "fetching stuff for %s" % url
    d['html'] = requests.get(url).text

    # boilerpipe's hosted api returns the extracted title and body text as json
    api_url = "http://boilerpipe-web.appspot.com/extract?url=" + \
        urllib.quote_plus(url) + "&extractor=ArticleExtractor&output=json"
    bp = requests.get(api_url).json()
    if bp['status'] == "success":
        d['title'] = bp['response']['title']
        d['text'] = bp['response']['content']
    else:
        d['title'] = None
        d['text'] = None

    fname = "data/" + uuid.uuid4().hex + ".json"
    print "writing %s to file" % fname
    with open(fname, 'w') as f:
        json.dump(d, f)
if __name__ == '__main__':
    threaded(data, get_article)
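Each run leaves one json file per article under data/ (the directory has to exist before the script starts). A minimal sketch of how that output could be pulled back into pandas afterwards; the file pattern and the per-outlet count are illustrative, not part of the scraper itself:

import glob
import json
import pandas

# load every per-article json file written by get_article() into one frame
records = []
for fname in glob.glob("data/*.json"):
    with open(fname) as f:
        records.append(json.load(f))

articles = pandas.DataFrame(records)
print articles.groupby('news_org').size()  # articles scraped per news_org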