Skip to content

Instantly share code, notes, and snippets.

@ziplokk1
Last active July 7, 2016 14:09
Show Gist options
  • Save ziplokk1/76a3f4d454403a745b0773e6ea3cf3be to your computer and use it in GitHub Desktop.
Save ziplokk1/76a3f4d454403a745b0773e6ea3cf3be to your computer and use it in GitHub Desktop.
"""
This middleware retries responses whose status code is in a configurable list, pausing for a configurable number of seconds before each retry.
"""
import time
from scrapy.contrib.downloadermiddleware.retry import RetryMiddleware
from scrapy.utils.response import response_status_message
class PauseOnStatusMiddleware(RetryMiddleware):
    """Retry responses with configured status codes after a fixed pause.

    Config (settings.py)::

        # Status codes to retry after pausing.
        TIMEOUT_STATUS_CODES = [503]
        # Optional pause in seconds before each retry. Default is 2.5.
        RETRY_TIMEOUT = 2.5
        # Set the middleware index to 501 which comes right after the retry
        # middleware. This way when a response is received, the
        # PauseOnStatusMiddleware intercepts the response before the base
        # retry middleware.
        # NOTE(review): this assumes the built-in RetryMiddleware is
        # registered at 500 -- confirm against the Scrapy version in use.
        DOWNLOADER_MIDDLEWARES = {
            'your_module.middleware.PauseOnStatusMiddleware': 501
        }

    The pause is implemented with ``time.sleep``, which blocks the calling
    thread for the whole duration; presumably this stalls every other
    in-flight download as well, so keep ``RETRY_TIMEOUT`` small.
    """

    # Default pause, in seconds, used when RETRY_TIMEOUT is not configured.
    TIMEOUT = 2.5

    def __init__(self, settings):
        """Read the pause-specific settings on top of the base retry settings.

        :param settings: the crawler settings object; must provide
            ``getlist`` and ``getfloat``.
        """
        super(PauseOnStatusMiddleware, self).__init__(settings)
        self.timeout_status_codes = {
            int(code) for code in settings.getlist('TIMEOUT_STATUS_CODES')
        }
        # getfloat() coerces string values (e.g. ``-s RETRY_TIMEOUT=5`` on
        # the command line) so that time.sleep() always receives a number.
        self.TIMEOUT = settings.getfloat('RETRY_TIMEOUT', self.TIMEOUT)

    def process_response(self, request, response, spider):
        """Pause and schedule a retry when the response status is configured.

        :param request: the request that produced ``response``.
        :param response: the response being processed.
        :param spider: the spider the request belongs to.
        :returns: the result of ``_retry`` when it is truthy, otherwise the
            original ``response`` unchanged.
        """
        if request.meta.get('dont_retry', False):
            return response

        if response.status not in self.timeout_status_codes:
            return response

        # Bump the cookiejar key so the retry is sent with a different jar
        # than the attempt that failed. Assumes cookiejar keys are integers.
        request.meta['cookiejar'] = request.meta.get('cookiejar', 0) + 1
        time.sleep(self.TIMEOUT)
        reason = response_status_message(response.status)
        # Fall back to the original response when _retry returns a falsy value.
        return self._retry(request, reason, spider) or response
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment