Skip to content

Instantly share code, notes, and snippets.

@Santhin
Last active July 5, 2022 02:51
Show Gist options
  • Save Santhin/bd10b4fee5cdd2cf9bf69c048476d26d to your computer and use it in GitHub Desktop.
Save Santhin/bd10b4fee5cdd2cf9bf69c048476d26d to your computer and use it in GitHub Desktop.
APscheduler with scrapy
#source stack
import asyncio
import logging
from datetime import datetime

import scrapy
import scrapy.utils.reactor
from scrapy import spiderloader
from scrapy.crawler import CrawlerRunner
from scrapy.utils import project
from scrapy.utils.log import configure_logging
from twisted.internet import asyncioreactor, defer

# The asyncio reactor must be installed BEFORE anything imports
# ``twisted.internet.reactor``: that import installs Twisted's default
# reactor as a side effect, and ``install_reactor`` silently ignores the
# resulting "reactor already installed" error, which would leave the
# default reactor in place.
scrapy.utils.reactor.install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
is_asyncio_reactor_installed = scrapy.utils.reactor.is_asyncio_reactor_installed()

# These imports bind the global reactor (apscheduler's Twisted scheduler
# imports it at module import time), so they deliberately come after the
# install call above.
from twisted.internet import reactor  # noqa: E402
from apscheduler.schedulers.twisted import TwistedScheduler  # noqa: E402

# Module-level scheduler bound to the installed reactor.
scheduler = TwistedScheduler(reactor=reactor)
@defer.inlineCallbacks
def crawl():
    """Run every spider of the Scrapy project, one after another.

    The project settings are loaded once and used both to discover the
    spiders and to configure the ``CrawlerRunner``, so each spider runs
    with the project's configuration rather than Scrapy's defaults.

    Because of ``@defer.inlineCallbacks``, calling this function returns
    a Deferred; each ``yield`` waits for the current crawl to finish
    before the next spider is started.
    """
    settings = project.get_project_settings()
    runner = CrawlerRunner(settings)
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    for spider_name in spider_loader.list():
        logging.info("Name of the active spider: %s", spider_name)
        # Wait for this spider to complete before moving on to the next.
        yield runner.crawl(spider_loader.load(spider_name))
def main():
    """Configure logging, schedule the crawl job and run the reactor.

    ``crawl`` is registered as an interval job that fires every 60
    seconds, with its first run set to the current time. The call to
    ``reactor.run()`` blocks until the reactor is stopped.
    """
    configure_logging()

    job_scheduler = TwistedScheduler()
    first_run = datetime.now()
    job_scheduler.add_job(
        crawl,
        'interval',
        seconds=60,
        next_run_time=first_run,
    )
    job_scheduler.start()

    reactor.run()
# Start the scheduler only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment