Skip to content

Instantly share code, notes, and snippets.

@nyov
Last active December 6, 2020 09:49
Show Gist options
  • Save nyov/9c70e780ea80204559d6da5525228702 to your computer and use it in GitHub Desktop.
Scrapy RSSSpider using feedparser
# -*- coding: utf-8 -*-
import logging
import scrapy
import feedparser
class RSSSpider(scrapy.Spider):
    """Spider that downloads RSS/Atom feed URLs with Scrapy and parses the
    downloaded body with feedparser (NOT re-downloading via feedparser).

    Yields one dict item per feed entry, carrying both feed-level metadata
    and per-entry fields.
    """
    name = "rss"

    # Feed URLs can be passed on the command line:
    # $ scrapy runspider rssspider.py -a 'urls=http://some.url/,https://some.other.url'
    def __init__(self, *args, **kwargs):
        # A non-empty 'urls' spider argument (comma-separated) overrides
        # any class-level start_urls.
        urls = kwargs.pop('urls', None)
        if urls:
            self.start_urls = urls.split(',')
        super(RSSSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        """Schedule one plain GET request per configured start URL."""
        for url in self.start_urls:
            yield scrapy.Request(url)

    def parse_feed(self, feed):
        """Parse RSS/Atom feed content using feedparser.

        :param feed: raw feed document — bytes (e.g. ``response.body``) or str.
        :returns: the feedparser result, or ``None`` when the feed is broken.
        """
        data = feedparser.parse(feed)
        if data.bozo:
            exc = data.bozo_exception
            logging.error('Bozo feed data. %s: %r',
                          exc.__class__.__name__, exc)
            # SAX parse errors know the offending line; log it for debugging.
            if hasattr(exc, 'getLineNumber') and hasattr(exc, 'getMessage'):
                line = exc.getLineNumber()
                logging.error('Line %d: %s', line, exc.getMessage())
                # BUG FIX: `feed` is usually bytes (response.body); splitting
                # bytes with a str separator raises TypeError on Python 3.
                sep = b'\n' if isinstance(feed, bytes) else '\n'
                segment = feed.split(sep)[line - 1]
                logging.info('Body segment with error: %r', segment)
            # could still try to return data. not necessarily completely broken
            return None
        return data

    def parse(self, response):
        """Parse the downloaded response body and yield one item per entry."""
        # parse downloaded content with feedparser (NOT re-downloading with feedparser)
        feed = self.parse_feed(response.body)
        if not feed:
            return
        # grab some feed elements
        # - https://pythonhosted.org/feedparser/common-rss-elements.html
        # - https://pythonhosted.org/feedparser/common-atom-elements.html
        #ns = feed.namespaces
        # Use .get() throughout: these elements are optional in many feeds,
        # and attribute access on a missing key raises AttributeError.
        feed_title = feed.feed.get('title')
        feed_link = feed.feed.get('link')
        feed_desc = feed.feed.get('description')
        for entry in feed.entries:
            # have content?
            content = entry.get('content')
            if content:
                # feedparser wraps content in a list of dicts; take the
                # first alternative's text value.
                content = content[0]['value']
            item = {
                # global feed data
                'feed_title': feed_title,
                'feed_link': feed_link,
                'feed_description': feed_desc,
                #
                # item entry data
                'url': response.url,
                'link': entry.get('link'),
                'title': entry.get('title'),
                'description': entry.get('description'),
                #'date': entry.get('published'),
                #'date': entry.get('published_parsed'),
                'date': entry.get('updated_parsed'),
                # optional
                'content': content,
                'type': entry.get('dc_type'),
            }
            yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment