Fetch all works for a CrossRef DOI prefix from the CrossRef API
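A typical invocation might look like the following (the prefix, start date, and output directory are illustrative examples, not values taken from the gist):

    python get_works.py -f 2014-01-01 -D ./output 10.1371

The script writes its results to a timestamped file named <prefix>_<timestamp>.txt inside the output directory, one work per line.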
"""get_works.py: Fetch all the works from CrossRef API by DOI prefix."""
__author__ = "Juan Pablo Alperin (@juancommander)"
import re
import urllib, urllib2
import simplejson as json
from time import sleep
import datetime
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-f", "--from", dest="f", help="specify the start date YYYY-DD")
parser.add_option("-D", "--dir", dest="out_dir", default=".", help="specify the directory for output")
(options, args) = parser.parse_args()
CROSSREF_PREFIX_API = 'http://api.crossref.org/prefixes/%s/works'
if not args:
    print "Usage: get_works.py [options] DOI_PREFIX"
    exit(1)
prefix = args[0]
if not re.match('^10\..*', prefix):
    print "Invalid prefix: %s" % prefix
    exit(1)
start_date = options.f
if start_date and not re.match('^\d\d\d\d\-.*', start_date):
    print "Invalid date, use YYYY-MM-DD: %s" % start_date
    exit(1)
out_dir = options.out_dir.strip('/')
tries = 0
offset = 0
rows = 50
url = CROSSREF_PREFIX_API % prefix
data = {}
if start_date:
    data['filter'] = "from-pub-date:%s" % start_date
lines = []
while True:
    try:
        data['rows'] = rows
        data['offset'] = offset

        content = json.load(urllib2.urlopen(url + '?' + urllib.urlencode(data)))

        total = int(content['message']['total-results'])
        if offset >= total:
            break
        offset += rows
        # a successful request resets the retry counter
        tries = 0

        items = content['message']['items']
        for item in items:
            doi = item['DOI']
            try:
                title = item['title'][0]
            except (KeyError, IndexError):
                title = 'untitled'

            # grab only 3 pieces of metadata for output: DOI, indexed date, and title
            date = "-".join([str(x).zfill(2) for x in item['indexed']['date-parts'][0]])
            lines.append(' '.join([doi, date, title, '\n']))
    except Exception, e:
        print e
        # 3 exceptions in a row and we give up
        if tries >= 3:
            print "failed to fetch URL after 3 tries"
            break
        tries += 1
        # just pause and the loop will try again
        sleep(3)
# write the results out, one line per work, in the format: DOI indexed_date title
if len(lines):
    with open('%s/%s_%s.txt' % (out_dir, prefix, datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S.%f")), 'w') as ofile:
        ofile.writelines([line.encode('utf8', 'ignore') for line in lines])
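For reference, each request issued by the loop has the form shown below (using the same illustrative prefix and start date as the usage example above; parameter order may vary, and urlencode escapes the colon in the filter value):

    http://api.crossref.org/prefixes/10.1371/works?filter=from-pub-date%3A2014-01-01&rows=50&offset=0

Each line of the output file has the form:

    <DOI> <indexed date> <title>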