Run a Scrapy spider in a Celery task (in Django)

My first shot at fixing this (the reactor's "not restartable" error) was the tasks.py snippet below.

But that just gave an "unhandled error in Deferred", so I moved on to CrawlerRunner.

That no longer showed any output at all and didn't run as expected.

Eventually, I just settled on `CELERY_WORKER_MAX_TASKS_PER_CHILD = 1` in settings.py.

Note: `CELERY_WORKER_MAX_TASKS_PER_CHILD` is the Django setting name; standalone Celery drops the `CELERY_` prefix and uses the lowercase `worker_max_tasks_per_child` setting.
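
For reference, a minimal sketch of that setting, assuming the usual Django/Celery wiring where celery.py calls `app.config_from_object('django.conf:settings', namespace='CELERY')`:

```python
# settings.py (Django)
# With namespace='CELERY', Celery reads this as worker_max_tasks_per_child.
# Recycling the worker child after every task sidesteps the reactor
# restart problem: each crawl gets a process with a fresh Twisted reactor.
CELERY_WORKER_MAX_TASKS_PER_CHILD = 1
```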

```python
# Utility class from https://stackoverflow.com/a/22202877/4126114
# because twisted's reactor gives the error "not restartable"
#
# Modified for celery==4.1.0 Scrapy==1.5.0 billiard==3.5.0.3
from billiard import Process
from scrapy import signals as scrapy_signals
from scrapy.crawler import Crawler
from twisted.internet import reactor


class UrlCrawlerScript(Process):
    """Run a single crawl in a separate billiard child process."""

    def __init__(self, spider):
        Process.__init__(self)
        self.crawler = Crawler(
            spider,
            settings={
                'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
                'ITEM_PIPELINES': {
                    # ...
                },
            },
        )
        # stop the reactor once the spider closes so run() can return
        self.crawler.signals.connect(reactor.stop, signal=scrapy_signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()


def run_spider(url):
    # CrJusticeGovLbSpiderDjango is the project's spider; its import is
    # not shown in the original snippet
    spider = CrJusticeGovLbSpiderDjango(url)
    crawler = UrlCrawlerScript(spider)
    # the script will block here until the crawling is finished
    crawler.start()
    crawler.join()
```
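
For context, a minimal sketch of the Celery task that would wrap `run_spider` above (the task name and decorator wiring are my assumptions; the original gist does not show them):

```python
from celery import shared_task


@shared_task
def crawl_url_task(url):
    # each invocation crawls in a fresh billiard child process,
    # so the Twisted reactor is never restarted inside the worker
    run_spider(url)
```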
```python
# Same goal as the billiard approach above, but using CrawlerRunner
# as documented at
# https://doc.scrapy.org/en/latest/topics/practices.html
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor


def run_url(instance, df_in):
    spider_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {
            # no need for the original pipeline since this local one inherits from it
            'bsec_compliance_kyc_import.scrapy_django_pipeline.DjangoPipeline': 400,
        },
    }
    runner = CrawlerRunner(settings=spider_settings)
    spider = CrJusticeGovLbSpiderDjango(scrape_instance=instance, df_in=df_in)
    d = runner.crawl(spider)
    # keep the reactor running across tasks instead of stopping it:
    # d.addBoth(lambda _: reactor.stop())
    # the script will block here until the crawling is finished
    if not reactor.running:
        reactor.run()
```
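
One plausible reason the CrawlerRunner variant was silent: unlike CrawlerProcess, CrawlerRunner leaves logging configuration to the caller, and the practices page linked above pairs it with `configure_logging`. A sketch of that addition (an assumption on my part, not something the original tried):

```python
from scrapy.utils.log import configure_logging

# CrawlerRunner does not install a log handler by itself;
# without this, the crawl can run with no visible output
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
```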