import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    """Register to receive the idle event."""
    spider = super(SecureSosStateOrUsSpider, cls).from_crawler(
        crawler, *args, **kwargs
    )
    crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
    return spider

def spider_idle(self, spider):
    """Schedule a simple request in order to return the collected data."""
    if self.data_submitted:
        return
    # This is a hack: I don't yet know how to schedule a request that just
    # submits data _without_ also triggering a scrape, so I request a simple
    # site whose response we ignore.
    # (Note: engine.schedule() has since been removed from Scrapy; on recent
    # releases, self.crawler.engine.crawl(null_request) is the equivalent.)
    null_request = scrapy.Request("http://neverssl.com/", callback=self.submit_data)
    self.crawler.engine.schedule(null_request, spider)
    raise DontCloseSpider

def submit_data(self, _):
    """Return the collection of all the scraped data, ignoring the content
    actually scraped by this request. I haven't figured out another way to
    submit the merged results.

    To be used as a callback when the spider is idle (i.e., has finished
    scraping).
    """
    self.data_submitted = True
    return self.sportsInventory
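
# For context, a minimal sketch (not part of the original snippet) of the
# enclosing spider class these methods assume. The spider name, start URL,
# and parse() body are hypothetical; only the `data_submitted` flag and the
# `sportsInventory` collection are actually required by the methods above.
class SecureSosStateOrUsSpider(scrapy.Spider):
    name = "secure_sos_state_or_us"                    # hypothetical
    start_urls = ["https://secure.sos.state.or.us/"]   # hypothetical

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data_submitted = False  # keeps spider_idle from re-scheduling
        self.sportsInventory = {}    # merged results returned by submit_data()

    def parse(self, response):
        # Hypothetical accumulation: stash per-page results in the shared
        # collection instead of yielding items page by page.
        self.sportsInventory[response.url] = response.css("title::text").get()

    # from_crawler, spider_idle, and submit_data (defined above) complete the
    # class. If sportsInventory is a dict, Scrapy treats the value returned by
    # submit_data() as a single scraped item, so the merged data flows out
    # through the usual item machinery (pipelines, feed exports, etc.).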