Skip to content

Instantly share code, notes, and snippets.

@TeraBytesMemory
Created February 7, 2021 04:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save TeraBytesMemory/697e5e5c48b605a44ddd9a5f73cc0e02 to your computer and use it in GitHub Desktop.
Timeout schedule middleware for Scrapy.
class TimeoutScheduleMiddleware(object):
    """Downloader middleware that closes a spider once it has been running
    longer than ``TIMEOUT_SEC`` seconds since it was opened (or since the
    window was last reset).

    NOTE(review): this snippet relies on module-level ``time`` and
    ``scrapy.signals`` imports — confirm they exist in the full file.
    """

    # Maximum allowed run time per spider, in seconds.
    TIMEOUT_SEC = 3600

    def __init__(self, crawler):
        self.crawler = crawler
        # Maps spider name -> epoch timestamp when its timeout window started.
        self.scheduler_per_spider = {}

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy entry point: build the middleware and subscribe to
        # spider_opened so each spider's start time gets recorded.
        s = cls(crawler)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def spider_opened(self, spider):
        # Start (or restart) the timeout window for this spider.
        self.scheduler_per_spider[spider.name] = time.time()

    def process_request(self, request, spider):
        # Bug fix: the original used TIMEOUT_SEC as the dict-lookup default,
        # so a spider missing from the map (spider_opened not yet fired) got
        # elapsed = now - 3600, which instantly exceeded the limit and closed
        # the spider. Default to "now" (elapsed == 0) instead, and register
        # the spider so subsequent requests are timed from this moment.
        now = time.time()
        started = self.scheduler_per_spider.setdefault(spider.name, now)
        if now - started > self.TIMEOUT_SEC:
            # Reset the window before closing so a restarted spider with the
            # same name is not immediately closed again.
            self.scheduler_per_spider[spider.name] = now
            self.crawler.engine.close_spider(
                spider, 'closespider_time_limit_exceeded')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment