@timfeirg
Created October 15, 2014 04:18
scrapyd command-line program: start scrapyd crawlers from a Python program
"""
scrapyd commandline interface, for my own project use
"""
import argparse
import requests
import json
import os
from scrapy.conf import settings
# this only works in scrapy, if you plan to use this API in command line, you
# should specify the scrapyd host url or the default localhost is used
url = settings['SCRAPYD_URL']
project = settings['BOT_NAME']
jobs_dir = settings['JOBSDIR']
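
# note that the scrapyd url is used as a prefix below ("%slistjobs.json" % url,
# etc.), so it is expected to end with a trailing slash,
# e.g. "http://localhost:6800/"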

def switch_job(crawler_name=None, force=True):
    """
    kill all running jobs and start the given one
    if force is True, kill all existing jobs first; otherwise just append the
    given job to the queue
    if crawler_name is None, only cancel jobs
    """
    if force:
        castrate()
    if crawler_name:
        response = start_job(crawler_name)
    else:
        response = None
    return response
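
# for example, switch_job("info_crawler") cancels every running and pending job
# and then schedules info_crawler; switch_job() on its own just cancels everything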

def list_jobs():
    response = requests.get("%slistjobs.json" % url,
                            params={"project": project})
    return json.loads(response.text)
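
# listjobs.json replies with roughly this shape (only the keys used below are
# shown; scrapyd also returns a "finished" list and per-job timestamps):
#   {"status": "ok",
#    "pending": [{"id": "<job id>", "spider": "<spider name>"}],
#    "running": [{"id": "<job id>", "spider": "<spider name>"}]}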

def alive_jobs(spider_list=None):
    """yields the job ids of all alive jobs, or only those of the given spiders"""
    if isinstance(spider_list, basestring):
        spider_list = [spider_list]
    job_list = list_jobs()
    alive_jobs = job_list['running'] + job_list['pending']
    job_list = []
    if spider_list:
        for spider in spider_list:
            job_list += [job["id"] for job in alive_jobs if job["spider"] == spider]
    else:
        job_list = [job["id"] for job in alive_jobs]
    for job_id in job_list:
        yield job_id
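
# alive_jobs is a generator, e.g. list(alive_jobs("rome_crawler")) gives the job
# ids of every running or pending rome_crawler job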

def cancel_job(job_id):
    payload = {"project": project, "job": job_id}
    response = requests.post("%scancel.json" % url, data=payload)
    return json.loads(response.text)
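
# cancel.json normally answers with something like
# {"status": "ok", "prevstate": "running"}, which is passed straight back to
# the caller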

def castrate(spider_list=None):
    """kill all jobs, or only those of the specified spiders"""
    if isinstance(spider_list, basestring):
        spider_list = [spider_list]
    while [job_id for job_id in alive_jobs(spider_list)]:
        # keep cancelling while any targeted spider is still alive
        for job_id in alive_jobs(spider_list):
            cancel_job(job_id)
    return 'all killed'

def start_job(crawler_name):
    """
    explanation of the different HTTP POST payloads between spiders:
    * spiders like info_crawler and newly_crawler store their current job
      progress in the database and read it back on every start to resume
    * rome_crawler doesn't: doing that would be too complicated and error
      prone, so it relies on scrapy's own job persistence feature, which
      requires setting a JOBDIR
    """
    if [job_id for job_id in alive_jobs(crawler_name)]:
        return "job already running"
    if crawler_name == "rome_crawler":
        payload = {
            "project": project,
            "spider": crawler_name,
            "setting": "JOBDIR=%s/%s" % (jobs_dir, crawler_name)
        }
    else:
        payload = {
            "project": project,
            "spider": crawler_name
        }
    response = requests.post("%sschedule.json" % url, data=payload)
    return json.loads(response.text)
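
# a successful schedule.json call replies with something like
# {"status": "ok", "jobid": "<job id>"}, which is returned as-is to the caller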

def clean_url(url):
    if not url.startswith("http://"):
        url = "http://" + url
    if not url.endswith("/"):
        url += "/"
    return url
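
# e.g. clean_url("example.com:6800") -> "http://example.com:6800/", so the
# endpoint paths above can simply be appended to it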
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", help="host:port for scrapyd, use localhost if not specified")
parser.add_argument("--project", help="project to perform on, default scrapy_crawler")
parser.add_argument("--jobsdir", help="job directory, default ~/scrapy_crawler_stuff/jobs_dir")
parser.add_argument("--start", help="specify spider to start")
parser.add_argument("--kill", help="sepcify spider to kill")
args = vars(parser.parse_args())
url = args.get("host") or "http://localhost:6800/"
url = clean_url(url)
project = args.get("project") or 'scrapy_crawler'
jobs_dir = args.get("jobsdir") or '%s/scrapy_crawler_stuff/jobs_dir' % os.environ["HOME"]
if args["kill"]:
response = castrate(args["kill"])
print(response)
elif args["start"]:
response = start_job(args["start"])
print(response)
else:
parser.print_help()
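
# example invocations (spider names depend on the project; scrapyd listens on
# port 6800 by default), assuming this file is saved as scrapyd_cli.py:
#   python scrapyd_cli.py --start rome_crawler
#   python scrapyd_cli.py --kill info_crawler --host example.com:6800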