Skip to content

Instantly share code, notes, and snippets.

@dogweather
Last active March 16, 2019 23:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dogweather/f6ec262198e4516ed380292cd2c8e569 to your computer and use it in GitHub Desktop.
Save dogweather/f6ec262198e4516ed380292cd2c8e569 to your computer and use it in GitHub Desktop.
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    """Create the spider via the parent factory and subscribe it to idle events.

    The new spider's ``spider_idle`` method is connected to the crawler's
    ``spider_idle`` signal before the spider is handed back.

    Args:
        crawler: The crawler whose signal manager the handler is attached to.
        *args: Positional arguments forwarded unchanged to the parent factory.
        **kwargs: Keyword arguments forwarded unchanged to the parent factory.

    Returns:
        The spider instance produced by the parent ``from_crawler``.
    """
    # Explicit two-argument super() is kept so the call works regardless of
    # the Python version this file targets.
    parent_factory = super(SecureSosStateOrUsSpider, cls).from_crawler
    instance = parent_factory(crawler, *args, **kwargs)

    # Ask to be notified when the engine runs out of work for this spider.
    crawler.signals.connect(instance.spider_idle, signal=signals.spider_idle)
    return instance
def spider_idle(self, spider):
    """Keep the idle spider open until the collected data has been returned.

    While ``self.data_submitted`` is falsy, a throwaway request is scheduled
    whose callback (``submit_data``) returns the collected data, and
    ``DontCloseSpider`` is raised so the engine does not close the spider.
    Once the data has been submitted this handler does nothing, which lets
    the spider close normally.

    Args:
        spider: The spider object passed along with the idle signal; it is
            forwarded to the engine when scheduling the request.

    Raises:
        scrapy.exceptions.DontCloseSpider: Every time the data has not been
            submitted yet.
    """
    if self.data_submitted:
        return

    # HACK: there is no known way here to schedule a request that only
    # submits data _without_ also triggering a scrape, so we fetch a simple
    # site and ignore its content.
    null_request = scrapy.Request(
        "http://neverssl.com/",
        callback=self.submit_data,
        # If the placeholder site is unreachable the callback would never
        # fire; routing failures to the same handler still emits the data
        # and sets the flag, so the spider can close.
        errback=self.submit_data,
        # This handler can run more than once with the identical URL; without
        # this the duplicate filter would silently drop the repeat request
        # while DontCloseSpider keeps the spider open forever.
        dont_filter=True,
    )
    self.crawler.engine.schedule(null_request, spider)
    raise scrapy.exceptions.DontCloseSpider
def submit_data(self, _):
    """Hand back the merged collection of scraped data and mark it as submitted.

    Meant to be used as the callback of the request scheduled once the spider
    has gone idle (i.e., has finished scraping). The single argument is
    ignored entirely; only the data already stored on the spider is returned.

    Returns:
        The current value of ``self.sportsInventory``.
    """
    # Set the flag first so that later idle checks see the data as submitted.
    self.data_submitted = True
    collected = self.sportsInventory
    return collected
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment