Created
February 7, 2021 04:06
-
-
Save TeraBytesMemory/697e5e5c48b605a44ddd9a5f73cc0e02 to your computer and use it in GitHub Desktop.
Timeout schedule middleware for Scrapy: closes a spider once it has been running longer than a fixed wall-clock time limit.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TimeoutScheduleMiddleware(object):
    """Downloader middleware that closes a spider after a wall-clock time limit.

    The start time of each spider is recorded on the ``spider_opened``
    signal; every request then checks how long the spider has been
    running and closes it with reason ``closespider_time_limit_exceeded``
    once ``TIMEOUT_SEC`` is exceeded.
    """

    # Maximum run time per spider, in seconds.
    TIMEOUT_SEC = 3600

    def __init__(self, crawler):
        self.crawler = crawler
        # Maps spider name -> epoch timestamp (time.time()) when timing started.
        self.scheduler_per_spider = {}

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy's entry point: build the middleware and hook the
        # spider_opened signal so we can record each spider's start time.
        s = cls(crawler)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def spider_opened(self, spider):
        # Record when this spider started running.
        self.scheduler_per_spider[spider.name] = time.time()

    def process_request(self, request, spider):
        """Close *spider* if it has run longer than ``TIMEOUT_SEC``; else pass.

        Returns ``None`` so Scrapy continues processing the request normally.

        Bug fix: the original fell back to ``self.TIMEOUT_SEC`` (3600) as the
        start *timestamp* when the spider was missing from the dict, i.e. an
        epoch time of 1:00 AM on Jan 1 1970 — the computed elapsed time was
        enormous and the spider was closed on its very first request whenever
        the spider_opened signal had not been recorded.  Default to "now"
        instead, and remember it via setdefault so later requests measure
        from this point.
        """
        started = self.scheduler_per_spider.setdefault(spider.name, time.time())
        if time.time() - started > self.TIMEOUT_SEC:
            # Reset the clock first so repeated requests arriving while the
            # spider shuts down don't re-trigger close_spider on every call.
            self.scheduler_per_spider[spider.name] = time.time()
            self.crawler.engine.close_spider(spider, 'closespider_time_limit_exceeded')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment