Last active
December 9, 2016 16:05
-
-
Save stav/5cf3163e0d77275870bf to your computer and use it in GitHub Desktop.
Scrapy sitemap generator pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" pipelines.py """ | |
import collections | |
import scrapy | |
import scrapy.contrib.exporter | |
import myproject | |
class SitemapPipeline(object):
    """
    Sitemap builder.

    Collects every scraped item, waits until the spider goes idle (i.e. all
    pages have been scraped), then builds a complete depth-indented sitemap
    of all URLs and injects it back into the item stream as a SitemapItem.
    """

    def __init__(self):
        # Per-instance item store.  The original code used a class-level
        # ``items = []`` attribute, which is shared by every instance of the
        # pipeline (and therefore across crawls in the same process), so
        # items from one run would leak into the next.
        self.items = []
        # NOTE(review): scrapy.contrib was deprecated and removed in modern
        # Scrapy; the exporter now lives at scrapy.exporters.PythonItemExporter.
        self.exporter = scrapy.contrib.exporter.PythonItemExporter(
            fields_to_export=('url', 'depth', 'referer'))

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy factory: keep a reference to the crawler and
        subscribe to spider_idle so the sitemap is built once scraping ends."""
        pipeline = cls()
        pipeline.crawler = crawler
        crawler.signals.connect(
            pipeline._spider_idle_handler, scrapy.signals.spider_idle)
        return pipeline

    def _spider_idle_handler(self, spider):
        """Fired when the spider has no more requests queued.

        Disconnect first so the sitemap item is only injected once even if
        the spider idles again after our item re-enters the pipeline.
        """
        self.crawler.signals.disconnect(
            self._spider_idle_handler, scrapy.signals.spider_idle)
        self._process_tree(spider)

    def _process_tree(self, spider):
        # Build the sitemap from everything collected so far, then hand it
        # back to the running engine as a new item.
        sitemap = self._get_sitemap()
        self._add_sitemap_item(sitemap, spider)

    def _get_sitemap(self):
        """Return a flat, depth-indented outline of all scraped URLs.

        Each entry has the form ``'<depth><"-" * depth><url>'``; children
        immediately follow their referer (parent) in depth-first order.
        """
        sitemap = []

        # Index every URL under a key combining the depth at which it was
        # found and the page that linked to it; also collect the roots
        # (depth 1).  One pass replaces the original's two loops, and the
        # unused ``max_depth`` bookkeeping is dropped.
        referers_map = collections.defaultdict(list)
        root_nodes = []
        for d in self.items:
            if d['depth'] == 1:
                root_nodes.append(d['url'])
            if 'referer' in d:
                referer_key = '{}{}{}'.format(
                    d['depth'], '-' * d['depth'], d['referer'])
                referers_map[referer_key].append(d['url'])

        def add_nodes(depth, referers):
            """Recursive depth-first walk: emit each URL, then its children."""
            _depth = depth + 1
            for referer in referers:
                sitemap.append('{}{}{}'.format(
                    _depth - 1, '-' * (_depth - 1), referer))
                child_key = '{}{}{}'.format(_depth, '-' * _depth, referer)
                # .get() instead of indexing: indexing a defaultdict inside
                # the recursion silently inserted an empty list for every
                # leaf URL, bloating the map for no benefit.
                add_nodes(_depth, referers_map.get(child_key, ()))

        add_nodes(1, root_nodes)
        return sitemap

    def _add_sitemap_item(self, sitemap, spider):
        """Inject a SitemapItem carrying the finished sitemap into the
        engine's scraper so it flows through the pipelines like any item."""
        request = response = None
        scraper = self.crawler.engine.scraper
        item = myproject.items.SitemapItem(dict(sitemap=sitemap))
        # NOTE(review): _process_spidermw_output is a private Scrapy API and
        # may break across Scrapy versions — confirm against the installed
        # release.
        scraper._process_spidermw_output(item, request, response, spider)

    def process_item(self, item, spider):
        # Export to a plain dict (only url/depth/referer) and stash it for
        # the sitemap build; pass the item through unchanged.
        self.items.append(self.exporter.export_item(item))
        return item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I get the following error...