Skip to content

Instantly share code, notes, and snippets.

@vibhanshuc
Created December 29, 2016 12:47
Show Gist options
  • Save vibhanshuc/6e3ecc184aa3680eeef2abb8833dc5d3 to your computer and use it in GitHub Desktop.
Save vibhanshuc/6e3ecc184aa3680eeef2abb8833dc5d3 to your computer and use it in GitHub Desktop.
from scrapy import cmdline
from swishpick.pipelines import *
from datetime import datetime
import subprocess
import time
# Map English weekday name -> ISO weekday number (Monday=1 ... Sunday=7).
# Kept for backward compatibility with any external lookups.
_week_day_relations = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7
}
_today_day = datetime.now().strftime("%A")
# BUG FIX: strftime("%A") is locale-dependent; under a non-English locale the
# dict lookup returned None and the spider query silently matched nothing.
# isoweekday() returns the same 1-7 numbering regardless of locale.
_day_num = datetime.now().isoweekday()

# Spiders scheduled to run on today's weekday number.
# NOTE(review): the column is named scheduled_start_time but is compared to a
# weekday number -- presumably it stores a day-of-week; confirm the schema.
spiders = SpiderStatus.select().where(SpiderStatus.scheduled_start_time == _day_num)

Next = 0           # index of the next spider in `spiders` to launch
MaxProcesses = 5   # maximum number of concurrent scrapy subprocesses
MaxUrls = spiders.count()  # total number of spiders to run today
Processes = []     # currently running subprocess.Popen handles
def StartNew():
    """Launch the next scheduled spider as a subprocess, if any remain.

    Reads/advances the module-level cursor ``Next`` and appends the new
    ``subprocess.Popen`` handle to ``Processes``. Does nothing once all
    ``MaxUrls`` spiders have been started.
    """
    global Next
    global Processes
    if Next < MaxUrls:
        spider = spiders[Next]
        log_file = '/var/log/swishpick/{0}_{1}.log'.format(
            spider.spider_name, datetime.now().strftime("%Y-%m-%d.%H%M%S"))
        # BUG FIX: '-s' and its value must be separate argv elements. The
        # original single string '-s LOG_FILE=...' made optparse see the
        # setting value with a leading space, breaking the log-file setting.
        # (Also removed a no-op bare `spider.spider_name` statement.)
        proc = subprocess.Popen(
            ['scrapy', 'crawl', spider.spider_name, '-s', 'LOG_FILE=' + log_file])
        Next += 1
        Processes.append(proc)
def CheckRunning():
    """Reap finished spider subprocesses and refill the free slots.

    ``poll()`` returns None while a process is still running; any handle
    with a non-None return code has exited and is dropped from
    ``Processes``. New spiders are then started until either the pool is
    full (``MaxProcesses``) or every spider has been launched.
    """
    global Processes
    global Next
    # Keep only the handles that are still running; poll() also reaps
    # each finished child, exactly as the original per-index deletion did.
    Processes[:] = [proc for proc in Processes if proc.poll() is None]
    # Top up the pool while there is both a free slot and work left.
    while len(Processes) < MaxProcesses and Next < MaxUrls:
        StartNew()
if __name__ == "__main__":
    # Prime the pool up to MaxProcesses, then poll until every spider
    # subprocess has exited and no work remains.
    CheckRunning()
    while Processes:  # something is still running
        time.sleep(0.1)  # polling interval; adjust to taste
        CheckRunning()
    print("Done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment