Fetch all works for a CrossRef DOI prefix from the CrossRef API
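A typical invocation might look like the following (the prefix, start date, and output directory are illustrative examples, not values taken from the gist):

    python get_works.py -f 2014-01-01 -D ./output 10.1371

The script writes its results to a timestamped file named <prefix>_<timestamp>.txt inside the output directory, one work per line.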
"""get_works.py: Fetch all the works from CrossRef API by DOI prefix."""
__author__ = "Juan Pablo Alperin (@juancommander)"
import re
import urllib, urllib2
import simplejson as json
from time import sleep
import datetime
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-f", "--from", dest="f", help="specify the start date YYYY-DD")
parser.add_option("-D", "--dir", dest="out_dir", default=".", help="specify the directory for output")
(options, args) = parser.parse_args()
CROSSREF_PREFIX_API = 'http://api.crossref.org/prefixes/%s/works'
if not args:
    print "Usage: get_works.py [options] DOI_PREFIX"
    exit(1)
prefix = args[0]
if not re.match('^10\..*', prefix):
    print "Invalid prefix: %s" % prefix
    exit(1)
start_date = options.f
if start_date and not re.match('^\d\d\d\d\-.*', start_date):
    print "Invalid date, use YYYY-MM-DD: %s" % start_date
    exit(1)
out_dir = options.out_dir.strip('/')
tries = 0
offset = 0
rows = 50
url = CROSSREF_PREFIX_API % prefix
data = {}
if start_date:
    data['filter'] = "from-pub-date:%s" % start_date
lines = []
while True:
    try:
        data['rows'] = rows
        data['offset'] = offset

        content = json.load(urllib2.urlopen(url + '?' + urllib.urlencode(data)))

        total = int(content['message']['total-results'])
        if offset >= total:
            break
        offset += rows
        # a successful request resets the retry counter
        tries = 0

        items = content['message']['items']
        for item in items:
            doi = item['DOI']
            try:
                title = item['title'][0]
            except (KeyError, IndexError):
                title = 'untitled'

            # grab only 3 pieces of metadata for output: DOI, indexed date, and title
            date = "-".join([str(x).zfill(2) for x in item['indexed']['date-parts'][0]])
            lines.append(' '.join([doi, date, title, '\n']))
    except Exception, e:
        print e
        # 3 exceptions in a row and we give up
        if tries >= 3:
            print "failed to fetch URL after 3 tries"
            break
        tries += 1
        # just pause and the loop will try again
        sleep(3)
# write the results out, one line per work, in the format: DOI indexed_date title
if len(lines):
    with open('%s/%s_%s.txt' % (out_dir, prefix, datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S.%f")), 'w') as ofile:
        ofile.writelines([line.encode('utf8', 'ignore') for line in lines])
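For reference, each request issued by the loop has the form shown below (using the same illustrative prefix and start date as the usage example above; parameter order may vary, and urlencode escapes the colon in the filter value):

    http://api.crossref.org/prefixes/10.1371/works?filter=from-pub-date%3A2014-01-01&rows=50&offset=0

Each line of the output file has the form:

    <DOI> <indexed date> <title>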