@ianjw11
Created December 12, 2016 16:17
Newspaper Scrapy adapter
import logging

import scrapy
from newspaper import Article, Config, Source
from newspaper.source import Feed

class ScrapyArticle(Article):

    def download(self, html=None):
        """Set the article's HTML from an already-fetched Scrapy response.

        Unlike newspaper's default download(), this never makes its own
        HTTP request; Scrapy is responsible for all fetching.
        """
        if html is not None:
            self.set_html(html)
            self.is_downloaded = True
        else:
            raise Exception("No HTML passed")


class NewsSource(Source):

    ARTICLE_CALLBACK = None

    def download(self):
        # Downloading is handled by Scrapy requests, so this is a no-op.
        pass

    @classmethod
    def from_scrapy_response(cls, response, config=None):
        """Build a source from an already-downloaded front-page response."""
        source = cls(response.url, config=config)
        source.html = response.body_as_unicode().strip()
        return source

    def _feed_callback(self, response):
        """Similar to feeds_to_articles, but called once per feed."""
        feed = response.meta['feed']
        logging.info(feed.url)
        body = feed.rss = response.body_as_unicode().strip()
        urls = self.extractor.get_urls(body, regex=True)
        for url in urls:
            article = ScrapyArticle(
                url=url,
                source_url=self.url,
                config=self.config
            )
            req = scrapy.Request(url, callback=self.ARTICLE_CALLBACK)
            req.meta['article'] = article
            if 'extra_data' in response.meta:
                req.meta['extra_data'] = response.meta['extra_data']
            yield req

    def _get_feeds(self, category):
        """We only see one category at a time, so we must call
        get_feed_urls individually for each category."""
        urls = self.extractor.get_feed_urls(self.url, [category])
        feeds = [Feed(url=url) for url in urls]
        for feed in feeds:
            req = scrapy.Request(feed.url, callback=self._feed_callback)
            req.meta['feed'] = feed
            yield req

    def _category_callback(self, response):
        """Similar to categories_to_articles."""
        category = response.meta['category']
        body = category.html = response.body_as_unicode().strip()
        category.doc = self.config.get_parser().fromstring(category.html)
        urls = self.extractor.get_urls(body, titles=True)
        for indiv_url, indiv_title in urls:
            article = ScrapyArticle(
                url=indiv_url,
                source_url=self.url,
                title=indiv_title,
                config=self.config
            )
            req = scrapy.Request(article.url, callback=self.ARTICLE_CALLBACK)
            req.meta['article'] = article
            if 'extra_data' in response.meta:
                req.meta['extra_data'] = response.meta['extra_data']
            yield req
        for feed_req in self._get_feeds(category):
            if 'extra_data' in response.meta:
                feed_req.meta['extra_data'] = response.meta['extra_data']
            yield feed_req

    def get_categories(self):
        """Yield one Scrapy request per category detected on the front page."""
        self.set_categories()
        for category in self.categories:
            logging.info(category.url)
            req = scrapy.Request(category.url, callback=self._category_callback)
            req.meta['category'] = category
            yield req

    def build(self, article_callback):
        """Parse the front-page HTML and register the spider callback
        that will receive each article response."""
        self.ARTICLE_CALLBACK = article_callback
        self.download()
        self.parse()
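
A minimal usage sketch (not part of the original gist) showing one way to wire NewsSource into a Scrapy spider; the spider name, start URL, and yielded item fields are illustrative assumptions.

class NewsSpider(scrapy.Spider):
    name = "news"                         # hypothetical spider name
    start_urls = ["https://example.com"]  # hypothetical news front page

    def parse(self, response):
        # Build the source from the already-downloaded front page and
        # let it yield one request per detected category.
        source = NewsSource.from_scrapy_response(response)
        source.build(article_callback=self.parse_article)
        for request in source.get_categories():
            yield request

    def parse_article(self, response):
        # Feed the fetched body into the ScrapyArticle attached in meta,
        # then parse it with newspaper and emit a simple item.
        article = response.meta['article']
        article.download(html=response.body_as_unicode().strip())
        article.parse()
        yield {"url": article.url, "title": article.title, "text": article.text}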