Scrapinghub API Job Log Sorter
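A small Python 2 script: given the URL of a Scrapinghub log.json endpoint, it downloads the job log, looks up the job's spider name through the jobs/list.json API, sorts the log entries by id, and writes them out as a readable plain-text .log file.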
import sys
import json
import argparse
from os.path import exists
from pprint import pprint
from urllib import urlencode, urlretrieve
from urllib2 import urlopen
from urlparse import urlsplit, parse_qs
# curl -u USER:PASS "http://panel.scrapinghub.com/api/jobs/list.json?project=999&spider=somespider&state=finished&count=-10"
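# paste your Scrapinghub API key here; it is used for the jobs/list.json lookup below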
apikey = '<privatekey>'
# declare command-line argument parser
command_line = argparse.ArgumentParser(prog=sys.argv[0], description='Sorts a JSON logfile.')
command_line.add_argument(dest='url', metavar='URL', help='the url for the log file')
options = command_line.parse_args(sys.argv[1:])
json_url = options.url
print 'grabbing the log at %s' % json_url
# json_url = 'http://panel.scrapinghub.com/api/log.json?project=999&job=5ga04d56ffddbd7er700081c&apikey=<privatekey>'
# SplitResult(
#     scheme='http',
#     netloc='panel.scrapinghub.com',
#     path='/api/log.json',
#     query='project=999&job=5ga04d56ffddbd7er700081c&apikey=<privatekey>',
#     fragment='')
query_dict = parse_qs(urlsplit(json_url).query)
# {'project': ['999'],
#  'job': ['5ga04d56ffddbd7er700081c'],
#  'apikey': ['<privatekey>']}
job = query_dict['job'][0]
print 'job %s' % job
project = query_dict['project'][0]
print 'project %s' % project
url = 'http://panel.scrapinghub.com/api/jobs/list.json?' + \
    urlencode(dict(
        apikey=apikey,
        project=project,
        job=job,
    ))
print 'getting api info from %s' % url
rp = urlopen(url)
json_str = rp.read()
json_data = json.loads(json_str)
pprint(json_data)
jobs = json_data.get('jobs', [])
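# jobs/list.json returns the matching jobs as a list of dicts; the first entry is our job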
spider = jobs[0].get('spider', 'default')
print 'spider %s' % spider
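# local filenames: <spider>.<job>.json for the raw download, <spider>.<job>.log for the sorted text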
json_file = '%s.%s.json' % (spider, job)
log_file = '%s.%s.log' % (spider, job)
if not exists(json_file):
    print 'retrieving %s...' % json_url
    print 'got', urlretrieve(json_url, json_file)
log = json.load(open(json_file))
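# order the entries by their id (assumed to increase chronologically)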
log.sort(key=lambda x: x['id'])
fp = open(log_file, 'wb')
for entry in log:
    line = u'%s %s %s\n' % (entry['time'], entry['logLevel'], entry['message'])
    fp.write(line.encode('utf-8'))
fp.close()
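
Sample invocation, assuming the script is saved as sorter.py (a hypothetical filename) and run under Python 2:

python sorter.py "http://panel.scrapinghub.com/api/log.json?project=999&job=5ga04d56ffddbd7er700081c&apikey=<privatekey>"

With the example job above, this leaves somespider.5ga04d56ffddbd7er700081c.json (the raw download) and somespider.5ga04d56ffddbd7er700081c.log (the sorted, human-readable log) in the working directory.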