Skip to content

Instantly share code, notes, and snippets.

@alanorth
Last active November 6, 2018 07:42
Show Gist options
  • Save alanorth/57a88379126d844563c1410bd7b8d12b to your computer and use it in GitHub Desktop.
Save alanorth/57a88379126d844563c1410bd7b8d12b to your computer and use it in GitHub Desktop.
Query the public ORCID API for names associated with ORCID identifiers, optionally from a DSpace authority Solr core.
#!/usr/bin/env python
#
# resolve-orcids.py 1.1.0
#
# Copyright 2018 Alan Orth.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ---
#
# Queries the public ORCID API for names associated with a list of ORCID iDs
# read from a text file or DSpace authority Solr core. Text file should have
# one ORCID identifier per line (comments and invalid lines are skipped).
#
# This script is written for Python 3 and requires several modules that you can
# install with pip (I recommend setting up a Python virtual environment first):
#
# $ pip install colorama requests requests-cache SolrClient
#
import argparse
from colorama import Fore
from datetime import timedelta
import re
import requests
import requests_cache
import signal
from SolrClient import SolrClient
import sys
# read ORCID identifiers from a text file, one per line
def read_identifiers_from_file():
    """Read ORCID iDs from ``args.input_file`` and resolve their names.

    Each line is stripped of surrounding whitespace. Lines that are not
    exactly one identifier of the form ``XXXX-XXXX-XXXX-XXXX`` (comments,
    blank lines, anything else) are skipped. Duplicates are dropped, keeping
    first-seen order. The input file is closed once it has been read and the
    unique identifiers are handed to ``resolve_orcid_identifiers``.
    """
    # regular expression for matching exactly one ORCID identifier on a line;
    # compiled once here instead of once per input line
    pattern = re.compile(r'^[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}$')

    # unique ORCID iDs, in the order they were first seen
    orcids = []
    # identifiers already collected, so the duplicate check is a set lookup
    # rather than a scan of the whole list for every line
    seen = set()

    try:
        for line in args.input_file:
            # trim any leading or trailing whitespace (including newlines)
            line = line.strip()

            # skip the line if it doesn't match the pattern
            if not pattern.match(line):
                continue

            if line not in seen:
                seen.add(line)
                orcids.append(line)
    finally:
        # close the input file, even if reading it failed part-way
        args.input_file.close()

    resolve_orcid_identifiers(orcids)
# query DSpace's authority Solr core for ORCID identifiers
def read_identifiers_from_solr():
    """Fetch ORCID iDs from DSpace's authority Solr core and resolve their names.

    Queries the ``authority`` collection at ``args.solr_url`` for documents
    matching ``orcid_id:*`` and collects the unique identifiers in first-seen
    order. With ``--extract-only`` each unique identifier is written to
    ``args.output_file`` and the program exits without contacting the ORCID
    API; otherwise the identifiers are handed to ``resolve_orcid_identifiers``.

    A warning is written to stderr when Solr reports more matching records
    than were returned, because the records beyond the returned page are not
    read by this function.
    """
    solr = SolrClient(args.solr_url)

    # simple query from the 'authority' collection, passing rows=2000
    # see: https://solrclient.readthedocs.io/en/latest/SolrClient.html
    res = solr.query('authority', {'q': 'orcid_id:*'}, rows=2000)

    num_found = res.get_num_found()

    if args.debug:
        sys.stderr.write(
            Fore.GREEN
            + 'Total number of Solr records with ORCID iDs: {0}\n'.format(str(num_found))
            + Fore.RESET
        )

    # only one page of results is read, so say so instead of silently
    # dropping whatever did not fit in it
    if len(res.docs) < num_found:
        sys.stderr.write(
            Fore.YELLOW
            + 'Warning: Solr returned only {0} of {1} matching records; the rest were not read.\n'.format(len(res.docs), num_found)
            + Fore.RESET
        )

    # unique ORCID iDs, in the order they were first seen
    # for example, we had 1600 ORCID iDs in Solr, but only 600 are unique
    orcids = []
    # identifiers already collected, so the duplicate check is a set lookup
    seen = set()

    for doc in res.docs:
        orcid = doc['orcid_id']

        if orcid in seen:
            continue

        seen.add(orcid)
        orcids.append(orcid)

        # if the user requested --extract-only, write the current (new)
        # ORCID iD to output_file
        if args.extract_only:
            args.output_file.write(orcid + '\n')

    # exit now if the user requested --extract-only
    if args.extract_only:
        if args.debug:
            sys.stderr.write(
                Fore.GREEN
                + 'Number of unique ORCID iDs: {0}\n'.format(str(len(orcids)))
                + Fore.RESET
            )

        # close output file before we exit
        args.output_file.close()

        # sys.exit() rather than the exit() helper, which the site module
        # only provides for interactive use
        sys.exit()

    resolve_orcid_identifiers(orcids)
# Query ORCID's public API for names associated with identifiers. Prefers to use
# the "credit-name" field if it is present, otherwise will default to using the
# "given-names" and "family-name" fields.
def resolve_orcid_identifiers(orcids):
    """Look up the name for each ORCID iD and write it to ``args.output_file``.

    For every identifier the ``/person`` endpoint of ORCID's public API is
    requested as JSON. The "credit-name" value is used when it is present and
    not blank; otherwise the "given-names" and "family-name" values are joined
    with a space, ignoring whichever of them is null or deactivated. Each
    result is written as ``<name>: <orcid>`` on its own line and, unless
    ``args.quiet`` is set, also printed.

    Identifiers that are missing (HTTP 404), locked (HTTP 409), have a null
    name element, or have no usable name are skipped. Any other non-OK HTTP
    status closes the output file and exits with status 1. The output file is
    closed before returning.

    Args:
        orcids: list of ORCID iD strings to resolve.
    """
    if args.debug:
        sys.stderr.write(
            Fore.GREEN
            + 'Resolving names associated with {0} unique ORCID iDs.\n\n'.format(str(len(orcids)))
            + Fore.RESET
        )

    # ORCID API endpoint, see: https://pub.orcid.org
    orcid_api_base_url = 'https://pub.orcid.org/v2.1/'
    orcid_api_endpoint = '/person'

    if orcids:
        # enable transparent request cache with 72 hour expiry; the settings
        # never change between requests, so install it once up front
        expire_after = timedelta(hours=72)

        # cache HTTP 200 and 404 responses, because ORCID uses HTTP 404 when an identifier doesn't exist
        requests_cache.install_cache('orcid-response-cache', expire_after=expire_after, allowable_codes=(200, 404))

        # prune old cache entries
        requests_cache.core.remove_expired_responses()

    # iterate through our ORCID iDs and fetch their names from the ORCID API
    for orcid in orcids:
        if args.debug:
            sys.stderr.write(
                Fore.GREEN
                + 'Looking up the names associated with ORCID iD: {0}\n'.format(orcid)
                + Fore.RESET
            )

        # build request URL for current ORCID ID
        request_url = orcid_api_base_url + orcid.strip() + orcid_api_endpoint

        # ORCID's API defaults to some custom format, so tell it to give us JSON
        request = requests.get(request_url, headers={'Accept': 'application/json'})

        # HTTP 404 means that the API url or identifier was not found. If the
        # API URL is correct, let's assume that the identifier was not found.
        if request.status_code == 404:
            if args.debug:
                sys.stderr.write(
                    Fore.YELLOW
                    + 'Warning: skipping missing identifier (API request returned HTTP 404).\n\n'
                    + Fore.RESET
                )
            continue

        # HTTP 409 means that the identifier is locked for some reason
        # See: https://members.orcid.org/api/resources/error-codes
        if request.status_code == 409:
            if args.debug:
                sys.stderr.write(
                    Fore.YELLOW
                    + 'Warning: skipping locked identifier (API request returned HTTP 409).\n\n'
                    + Fore.RESET
                )
            continue

        # anything else that is not a success is treated as fatal
        if request.status_code != requests.codes.ok:
            sys.stderr.write(Fore.RED + 'Error: request failed.\n' + Fore.RESET)

            # close output file before we exit
            args.output_file.close()

            sys.exit(1)

        # read response JSON into data
        data = request.json()

        # make sure name element is not null
        name = data.get('name')
        if not name:
            if args.debug:
                sys.stderr.write(
                    Fore.YELLOW
                    + 'Warning: skipping identifier with null name element.\n\n'
                    + Fore.RESET
                )
            continue

        # prefer to use credit-name if present and not blank
        credit_name = _usable_name_value(name.get('credit-name'), None)

        if credit_name is not None and credit_name.strip() != '':
            line = credit_name
        else:
            # otherwise try to use given-names and or family-name; the parts
            # are collected afresh for every identifier so a missing
            # given-names element can never leave the name undefined
            parts = []

            # make sure given-names is present and not deactivated
            given_names = _usable_name_value(name.get('given-names'), 'Given Names Deactivated')
            if given_names is not None:
                parts.append(given_names)
            elif args.debug:
                sys.stderr.write(
                    Fore.YELLOW
                    + 'Warning: ignoring null or deactivated given-names element.\n'
                    + Fore.RESET
                )

            # make sure family-name is present and not deactivated
            family_name = _usable_name_value(name.get('family-name'), 'Family Name Deactivated')
            if family_name is not None:
                parts.append(family_name)
            elif args.debug:
                sys.stderr.write(
                    Fore.YELLOW
                    + 'Warning: ignoring null or deactivated family-name element.\n'
                    + Fore.RESET
                )

            line = ' '.join(parts)

        # check if line has something (a credit-name, given-names, and or family-name)
        line = line.strip()
        if not line:
            if args.debug:
                sys.stderr.write(
                    Fore.RED
                    + 'Error: skipping identifier with no valid name elements.\n'
                    + Fore.RESET
                )
            continue

        line = '{0}: {1}'.format(line, orcid)

        # output results to screen to show progress
        if not args.quiet:
            print(line)

        # write formatted name and ORCID identifier to output file
        args.output_file.write(line + '\n')

    # close output file before we exit
    args.output_file.close()


def _usable_name_value(element, deactivated_marker):
    """Return the text of one name element, or None if it cannot be used.

    An element is unusable when it is null/empty, has no ``value``, or its
    ``value`` equals ``deactivated_marker`` (pass None when the element has
    no deactivated placeholder).
    """
    if not element:
        return None

    value = element.get('value')
    if value is None or value == deactivated_marker:
        return None

    return value
def signal_handler(signal, frame):
    """Close the output file and terminate the program with exit status 1.

    Neither argument is used by the handler.
    """
    # close output file before we exit
    args.output_file.close()

    raise SystemExit(1)
# command line interface: an output file is always required, plus exactly one
# source of ORCID identifiers (a text file or a Solr URL)
parser = argparse.ArgumentParser(
    description=(
        'Query the public ORCID API for names associated with a list of ORCID identifiers, '
        'either from a text file or a DSpace authority Solr core. '
        'Optional "extract only" mode will simply fetch the ORCID identifiers from Solr '
        'and write them to the output file without resolving their names from ORCID\'s API.'
    )
)
parser.add_argument(
    '--debug', '-d',
    help='Print debug messages to standard error (stderr).',
    action='store_true',
)
parser.add_argument(
    '--extract-only', '-e',
    help='If fetching ORCID identifiers from Solr, write them to the output file without resolving their names from the ORCID API.',
    action='store_true',
)
parser.add_argument(
    '--output-file', '-o',
    help='Name of output file to write to.',
    required=True,
    type=argparse.FileType('w', encoding='UTF-8'),
)
parser.add_argument(
    '--quiet', '-q',
    help='Do not print results to screen as we find them (results will still go to output file).',
    action='store_true',
)

# group of mutually exclusive options
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
    '--input-file', '-i',
    help='File name containing ORCID identifiers to resolve.',
    type=argparse.FileType('r'),
)
group.add_argument(
    '--solr-url', '-s',
    help='URL of Solr application (for example: http://localhost:8080/solr).',
)

# module-level on purpose: the functions in this file read these options
# through the global name ``args``
args = parser.parse_args()

# set the signal handler for SIGINT (^C) so we can exit cleanly
signal.signal(signal.SIGINT, signal_handler)

# if the user specified an input file, get the ORCID identifiers from there
if args.input_file:
    read_identifiers_from_file()
# otherwise, get the ORCID identifiers from Solr
elif args.solr_url:
    read_identifiers_from_solr()

# sys.exit() rather than the exit() helper, which the site module only
# provides for interactive use
sys.exit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment