scrapyd command-line program: start scrapyd crawlers from a Python program
""" | |
scrapyd commandline interface, for my own project use | |
""" | |
import argparse | |
import requests | |
import json | |
import os | |
from scrapy.conf import settings | |
# this only works in scrapy, if you plan to use this API in command line, you | |
# should specify the scrapyd host url or the default localhost is used | |
url = settings['SCRAPYD_URL'] | |
project = settings['BOT_NAME'] | |
jobs_dir = settings['JOBSDIR'] | |
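# for illustration, the corresponding entries in the project's settings.py
# would look something like this (the values below are assumptions, matching
# the command-line defaults further down):
#   SCRAPYD_URL = "http://localhost:6800/"
#   BOT_NAME = "scrapy_crawler"
#   JOBSDIR = "/home/user/scrapy_crawler_stuff/jobs_dir"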
def switch_job(crawler_name=None, force=True):
    """
    kill all running jobs and start the given one
    if force is True, existing jobs are killed first; otherwise the given job
    is simply appended to the queue
    if crawler_name is None, only cancel jobs and start nothing
    """
    if force:
        castrate()
    if crawler_name:
        response = start_job(crawler_name)
    else:
        response = None
    return response
def list_jobs():
    response = requests.get("%slistjobs.json" % url,
                            params={"project": project})
    return json.loads(response.text)
def alive_jobs(spider_list=None):
    """yields the job ids of all, or only the specified, spiders"""
    if isinstance(spider_list, basestring):
        spider_list = [spider_list]
    job_list = list_jobs()
    alive_jobs = job_list['running'] + job_list['pending']
    job_list = []
    if spider_list:
        for spider in spider_list:
            job_list += [job["id"] for job in alive_jobs if job["spider"] == spider]
    else:
        job_list = [job["id"] for job in alive_jobs]
    for job_id in job_list:
        yield job_id
def cancel_job(job_id):
    payload = {"project": project, "job": job_id}
    response = requests.post("%scancel.json" % url, data=payload)
    return json.loads(response.text)
def castrate(spider_list=None):
    """kill all running and pending jobs, optionally only those of the given spiders"""
    if isinstance(spider_list, basestring):
        spider_list = [spider_list]
    while [job_id for job_id in alive_jobs(spider_list)]:
        # keep cancelling while any targeted spider is still alive
        for job_id in alive_jobs(spider_list):
            cancel_job(job_id)
    return 'all killed'
def start_job(crawler_name):
    """
    explanation of the different http POST payloads between spiders:
    * spiders like info_crawler and newly_crawler store their current job
      progress in the database and read it back at every start to resume
    * rome_crawler doesn't, because that would be too complicated and error
      prone, so it relies on scrapy's own job persistence feature, which
      requires setting a JOBDIR
    """
    if [job_id for job_id in alive_jobs(crawler_name)]:
        return "job already running"
    if crawler_name == "rome_crawler":
        payload = {
            "project": project,
            "spider": crawler_name,
            "setting": "JOBDIR=%s/%s" % (jobs_dir, crawler_name)
        }
    else:
        payload = {
            "project": project,
            "spider": crawler_name
        }
    response = requests.post("%sschedule.json" % url, data=payload)
    return json.loads(response.text)
def clean_url(url):
    if not url.startswith("http://"):
        url = "http://" + url
    if not url.endswith("/"):
        url += "/"
    return url
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--host", help="host:port for scrapyd, use localhost if not specified") | |
parser.add_argument("--project", help="project to perform on, default scrapy_crawler") | |
parser.add_argument("--jobsdir", help="job directory, default ~/scrapy_crawler_stuff/jobs_dir") | |
parser.add_argument("--start", help="specify spider to start") | |
parser.add_argument("--kill", help="sepcify spider to kill") | |
args = vars(parser.parse_args()) | |
url = args.get("host") or "http://localhost:6800/" | |
url = clean_url(url) | |
project = args.get("project") or 'scrapy_crawler' | |
jobs_dir = args.get("jobsdir") or '%s/scrapy_crawler_stuff/jobs_dir' % os.environ["HOME"] | |
if args["kill"]: | |
response = castrate(args["kill"]) | |
print(response) | |
elif args["start"]: | |
response = start_job(args["start"]) | |
print(response) | |
else: | |
parser.print_help() |
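For reference, the scrapyd JSON API endpoints the script wraps (schedule.json, listjobs.json, cancel.json) can also be called directly with requests; here is a minimal sketch, assuming scrapyd listens on localhost:6800 and that the project and spider names below (scrapy_crawler, info_crawler) exist on the daemon:

import requests

SCRAPYD = "http://localhost:6800/"
PROJECT = "scrapy_crawler"   # assumed project name
SPIDER = "info_crawler"      # assumed spider name

# schedule a crawl; on success scrapyd returns a job id
response = requests.post(SCRAPYD + "schedule.json",
                         data={"project": PROJECT, "spider": SPIDER})
job_id = response.json()["jobid"]

# list pending, running and finished jobs for the project
jobs = requests.get(SCRAPYD + "listjobs.json", params={"project": PROJECT}).json()

# cancel the job scheduled above
requests.post(SCRAPYD + "cancel.json", data={"project": PROJECT, "job": job_id})

The script above does the same, plus the bookkeeping in castrate() and switch_job() that cancels every running job of a spider before starting a new one.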