Skip to content

Instantly share code, notes, and snippets.

@dynamicguy
Created June 4, 2020 13:17
Show Gist options
  • Save dynamicguy/b184deccf8c7275718288bd82d5ec95f to your computer and use it in GitHub Desktop.
Save dynamicguy/b184deccf8c7275718288bd82d5ec95f to your computer and use it in GitHub Desktop.
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from my_project.spiders.deals import DealsSpider
def crawl_job():
    """Start a single run of ``DealsSpider`` with the project's settings.

    A new ``CrawlerRunner`` is built from ``get_project_settings()`` on
    every call.

    Returns:
        The Deferred handed back by ``CrawlerRunner.crawl``; it fires once
        the crawl has completed.
    """
    return CrawlerRunner(get_project_settings()).crawl(DealsSpider)
def schedule_next_crawl(null, sleep_time):
    """Ask the reactor to call ``crawl`` again after a delay.

    The signature fits a Deferred callback: the first positional argument
    receives the result of the previous step in the chain.

    Args:
        null: Result passed along by the Deferred chain; not used.
        sleep_time: Delay, in seconds, before ``crawl`` is invoked.
    """
    # callLater only registers the call; it does not block.
    reactor.callLater(sleep_time, crawl)
def crawl():
    """Run one crawl job and queue the following one.

    When the job's Deferred fires successfully, ``schedule_next_crawl`` is
    called with a 30 second delay, which in turn re-invokes this function.
    That is what makes the loop "recursive". A failure from the job or
    from the scheduling callback is passed to ``catch_error`` instead.
    """
    # addCallback/addErrback each return the Deferred itself, so the chain
    # registers the callback first and the errback second, in that order.
    crawl_job().addCallback(schedule_next_crawl, 30).addErrback(catch_error)
def catch_error(failure):
    """Errback that prints the error carried by ``failure``.

    Args:
        failure: Object whose ``value`` attribute holds the error to
            report (presumably a Twisted ``Failure``, since this function
            is registered with ``addErrback``).
    """
    error = failure.value
    print(error)
if __name__ == "__main__":
    # Register the first crawl before starting the reactor; each finished
    # crawl then schedules its successor.
    crawl()
    # Hand control to the Twisted event loop.
    reactor.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment