Skip to content

Instantly share code, notes, and snippets.

@ebrensi
Last active June 10, 2016 01:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ebrensi/f81f1e32bf8f4d5ceed3c0fbad39df7a to your computer and use it in GitHub Desktop.
Save ebrensi/f81f1e32bf8f4d5ceed3c0fbad39df7a to your computer and use it in GitHub Desktop.
Scrapy extension to keep Heroku awake when running scrapyd
import logging
from twisted.internet import task
from twisted.web.client import getPage
from scrapy import signals
from scrapy.exceptions import NotConfigured
logger = logging.getLogger(__name__)
class KeepHerokuAwake(object):
    """Scrapy extension that pings a URL at a fixed interval while crawling.

    Settings read from the crawler:

    * ``HEROKU_PING_INTERVAL_MINUTES`` -- minutes between pings; a value of
      zero or less disables the extension.
    * ``HEROKU_SCRAPYD_URL`` -- the URL to request on every ping; the
      extension is disabled when it is empty or missing.
    """

    def __init__(self, crawler):
        """Read the settings and hook into the engine start/stop signals.

        Raises:
            NotConfigured: if the ping interval is not positive or no URL
                is configured.
        """
        self.ping_interval = crawler.settings.getint(
            'HEROKU_PING_INTERVAL_MINUTES') * 60
        # A negative interval is not a usable schedule either, so treat it
        # the same as "disabled" instead of failing later at start time.
        if self.ping_interval <= 0:
            raise NotConfigured

        self.heroku_scrapyd_url = crawler.settings.get('HEROKU_SCRAPYD_URL')
        # Without a URL every ping would fail, so opt out up front.
        if not self.heroku_scrapyd_url:
            raise NotConfigured("HEROKU_SCRAPYD_URL is not set")

        self.crawler = crawler
        # Created here so engine_stopped is safe even if engine_started
        # never ran.
        self.tasks = []

        crawler.signals.connect(self.engine_started,
                                signal=signals.engine_started)
        crawler.signals.connect(self.engine_stopped,
                                signal=signals.engine_stopped)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from a crawler."""
        return cls(crawler)

    def engine_started(self):
        """Start the periodic ping; the first ping fires after one interval."""
        self.tasks = []
        tsk = task.LoopingCall(self.ping_scrapyd)
        self.tasks.append(tsk)
        tsk.start(self.ping_interval, now=False)
        logger.info("Set to ping %s every %s minutes.",
                    self.heroku_scrapyd_url,
                    # Floor division keeps the logged value an integer.
                    self.ping_interval // 60)

    def engine_stopped(self):
        """Stop every ping task that is still running."""
        for tsk in self.tasks:
            if tsk.running:
                tsk.stop()

    def ping_scrapyd(self):
        """Request the configured URL and return the resulting Deferred.

        Failures are logged and swallowed: a Deferred that fails would make
        the LoopingCall stop, ending all future pings after a single error.
        """
        logger.info("pinging scrapyd at %s", self.heroku_scrapyd_url)
        url = self.heroku_scrapyd_url
        # getPage works on byte-string URLs; encode text settings values.
        if not isinstance(url, bytes):
            url = url.encode('utf-8')
        d = getPage(url)
        d.addCallback(self.ping_callback)
        d.addErrback(self.ping_errback)
        return d

    def ping_callback(self, response_string):
        """Log a successful ping; the response body is ignored."""
        logger.info("ping confirmed")

    def ping_errback(self, failure):
        """Log a failed ping and consume the failure so pinging continues."""
        logger.warning("ping to %s failed: %s",
                       self.heroku_scrapyd_url, failure.getErrorMessage())
@ebrensi
Copy link
Author

ebrensi commented Jun 10, 2016

This extension sends an HTTP request to a URL of your choice, at an interval of your choice, while a Scrapy crawler is running.

In order to use it you need to do a few things. Let's assume your project is called project.

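  1. Save this file as extensions.py inside your project's package folder (the `project` folder that holds settings.py), so it is importable as `project.extensions`.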

  2. Make Scrapy recognize the extension with

    EXTENSIONS = {
        'project.extensions.KeepHerokuAwake': 500
    }
    
  3. Add these settings:

    HEROKU_PING_INTERVAL_MINUTES = 15
    HEROKU_SCRAPYD_URL = "http://myScrapydapp.herokuapp.com"      # (or whatever)
    

You can disable the extension by setting HEROKU_PING_INTERVAL_MINUTES = 0.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment