Created
June 23, 2013 16:04
-
-
Save abelsonlive/5845517 to your computer and use it in GitHub Desktop.
the scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas | |
import requests | |
import re | |
import urllib | |
import itertools | |
import uuid | |
import json | |
from urlparse import urljoin | |
from BeautifulSoup import BeautifulSoup | |
from datetime import datetime | |
from itertools import groupby | |
from Queue import Queue | |
from threading import Thread | |
import logging | |
# Master spreadsheet listing the sites to scrape: one row per site with its
# front-page URL ('site_url'), an article-URL regex ('site_regex'), and a
# human-readable name ('site_name').
master_csv = 'https://docs.google.com/spreadsheet/pub?key=0An9Q5Mkz4lG7dGt6ZTNZLU03cnZRMkQtMkdaZ1lhd1E&output=csv'
d = pandas.read_csv(master_csv)

# Scrape timestamp, seconds since epoch.
# NOTE(review): "%s" is a platform-specific strftime extension (Linux/OSX,
# not Windows) -- confirm deployment target.
time = datetime.now().strftime("%s")

data = []
for i in d.index:
    url = d['site_url'][i]
    regex = d['site_regex'][i]
    news_org = d['site_name'][i]

    # Collect every hyperlink on the site's front page.
    soup = BeautifulSoup(requests.get(url).text)
    links = [a['href'].encode('utf-8') for a in soup.findAll('a', href=True)]

    # Strip the site URL where it appears as a literal prefix, then resolve
    # everything back to an absolute URL.  re.escape so metacharacters in
    # the URL (".", "?", ...) are matched literally, not as regex syntax.
    links = [re.sub(re.escape(url), "", l) for l in links]
    links = [urljoin(url, l) for l in links]

    # Keep only links matching this site's article pattern...
    articles = [l for l in links if re.match(regex, l)]

    # ...and de-duplicate while preserving page order.  (The original used
    # itertools.groupby without sorting, which only drops *adjacent*
    # duplicates.)
    seen = set()
    unique_articles = []
    for a in articles:
        if a not in seen:
            seen.add(a)
            unique_articles.append(a)

    for a in unique_articles:
        data.append({'url': a, 'news_org': news_org, 'time': time})
def threaded(items, func, num_threads=100, max_queue=200):
    """Apply func(item) to every item using a pool of daemon worker threads.

    Blocks until every queued item has been processed (queue.join).
    Exceptions raised by func are logged with traceback and swallowed so
    a single bad item cannot kill its worker thread.

    :param items: iterable of work items fed to the queue.
    :param func: callable invoked once per item.
    :param num_threads: number of worker threads to start.
    :param max_queue: queue capacity; the producer blocks when full.
    """
    def queue_consumer():
        # Each worker loops forever; threads are daemonic, so they die
        # with the main thread once queue.join() returns.
        while True:
            item = queue.get(True)
            try:
                func(item)
            except Exception as e:
                # BUGFIX: was `log.exception(e)` -- `log` is never defined
                # in this file; use the imported logging module.
                logging.exception(e)
            queue.task_done()

    queue = Queue(maxsize=max_queue)
    for i in range(num_threads):
        t = Thread(target=queue_consumer)
        t.daemon = True  # don't block interpreter exit
        t.start()
    for item in items:
        queue.put(item, True)  # block while the queue is full
    queue.join()
def get_article(d):
    """Fetch one article and persist it as JSON.

    Mutates `d` in place -- adds 'html' (raw page source) plus 'title' and
    'text' extracted by the boilerpipe web service (both None on extraction
    failure) -- then dumps the dict to data/<random uuid>.json.

    :param d: dict with at least a 'url' key (built by the scraper loop).
    """
    url = d['url']
    print("fetching stuff for %s" % url)
    d['html'] = requests.get(url).text

    # Extract readable article body via the boilerpipe web service.
    api_url = "http://boilerpipe-web.appspot.com/extract?url=" + urllib.quote_plus(url) + "&extractor=ArticleExtractor&output=json"
    # BUGFIX: with requests >= 1.0, .json is a method and must be called;
    # the bare attribute access returned the bound method, breaking the
    # subscripting below.
    bp = requests.get(api_url).json()
    if bp['status'] == "success":
        d['title'] = bp['response']['title']
        # BUGFIX: original had a stray trailing `s` (`['content']s`),
        # a syntax error.
        d['text'] = bp['response']['content']
    else:
        d['title'] = None
        d['text'] = None

    # Write under a random file name.  Assumes data/ already exists --
    # TODO(review): confirm, or create it with os.makedirs.
    fname = "data/" + uuid.uuid4().hex + ".json"
    # BUGFIX: original printed the literal "%s" -- the % fname arg was missing.
    print("writing %s to file" % fname)
    with open(fname, 'w') as f:
        json.dump(d, f)
# Entry point: fan the collected article dicts out across the thread pool.
if __name__ == '__main__':
    threaded(data, get_article)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment