Last active
June 1, 2021 03:20
-
-
Save Jason-Gush/bcbab1c3c55e5684251ad3b8ee04eded to your computer and use it in GitHub Desktop.
Using the NZ ORCID Hub's API to compile ORCID works
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python3.6 | |
# Use an access token from the Hub to retrieve (all) users | |
# With the list of users, use the ORCID API proxy to retrieve their work summaries | |
# Done handle 401 responses by using a public read directly from ORCID | |
# Done change from json out to tsv | |
# Done make UTF-8 safe, much easier than I thought but seems to slow operation | |
# Done use time.sleep to control request/min and prevent overloading ORCID | |
# (Was 8/sec, now max 20/sec to be safe for V2 and V3)
# From the work summaries, call each work individually to get the needed metadata missing from the summary | |
# Write row at time to allow recovery from error and add skip_orcid to allow resume at specific ORCID ID | |
# Use public API for bulk of work reads, only handing Trusted Party and Private calls back to the Hub's API | |
import requests | |
import time | |
import json | |
def fetch_hub_access_token(app_id, secret, service):
    """Request a client-credentials access token from the Hub.

    POSTs the client id and secret to ``https://<service>/oauth/token``.

    Args:
        app_id: sent as ``client_id``.
        secret: sent as ``client_secret``.
        service: host name the token endpoint is built from.

    Returns:
        The ``access_token`` value of the JSON reply.

    Raises:
        Exception: when the reply status is not 200; the message carries
            the status code and reason.
    """
    token_url = 'https://' + service + '/oauth/token'
    credentials = {
        'client_id': app_id,
        'client_secret': secret,
        'grant_type': 'client_credentials',
    }
    response = requests.post(url=token_url, data=credentials)
    if response.status_code != 200:
        raise Exception('Error: {0} {1}'.format(response.status_code, response.reason))
    return response.json()['access_token']
# Load the Hub / ORCID settings from the local JSON config file.
with open('Hub API credentials.json') as config_file:
    API_credentials = json.load(config_file)

client_id = API_credentials["client_id"]
client_secret = API_credentials["client_secret"]
environment = API_credentials["service"]
public_orcid_access_token = API_credentials["public_access_token"]
ORCID_API_version = API_credentials['orcid_version']

# Each supported Hub host is paired with the public ORCID host used for
# direct public reads.
_PUBLIC_ORCID_HOSTS = {
    'test.orcidhub.org.nz': 'pub.sandbox.orcid.org',
    'orcidhub.org.nz': 'pub.orcid.org',
}

# Validate the configured service BEFORE requesting a token, so the client
# secret is never posted to an unrecognised host.
_hub_host = environment.lower()
if _hub_host not in _PUBLIC_ORCID_HOSTS:
    raise Exception('Invalid config environment: specify the service as either test.orcidhub.org.nz or orcidhub.org.nz')

hub_url = 'https://' + _hub_host + '/api/v1/'
member_orcid_url = 'https://' + _hub_host + '/orcid/api/' + ORCID_API_version + '/'
pub_orcid_url = 'https://' + _PUBLIC_ORCID_HOSTS[_hub_host] + '/' + ORCID_API_version + '/'

access_token = fetch_hub_access_token(client_id, client_secret, environment)
member_headers = {'accept': 'application/json', 'authorization': 'Bearer ' + access_token}
public_headers = {'accept': 'application/json', 'authorization': 'Bearer ' + public_orcid_access_token}
def fetch_hub_users():
    """Collect every user record from the Hub's paginated ``users`` endpoint.

    Pages are requested starting at page 1. The ``pagination-count``
    response header is treated as the number of users on the current
    page; paging stops at the first page reporting zero.

    Returns:
        A list with the user entries of all pages, in page order.

    Raises:
        Exception: on a 401 reply (token expired/revoked) or any other
            non-200 status.
    """
    headers = {'accept': 'application/json', 'authorization': 'Bearer ' + access_token}
    body = []
    page = 1
    while True:
        response = requests.get(hub_url + 'users', headers=headers, params={'page': page})
        if response.status_code == 401:
            raise Exception('Access token expired/revoked get a new one')
        if response.status_code != 200:
            raise Exception('Error: {0} {1}'.format(response.status_code, response.reason))

        users_on_page = int(response.headers['pagination-count'])
        if users_on_page <= 0:
            break
        # Take ALL users_on_page entries: the previous range(0, n - 1) loop
        # silently dropped the last user of every page. The payload is also
        # parsed once per page instead of once per user.
        body.extend(response.json()[:users_on_page])
        page += 1
    return body
def fetch_works_list(orcid):
    """Fetch the works list of *orcid*, falling back to a public read.

    The member URL is tried first with the member headers. If that reply
    is not 200, the failure is printed and, after a 0.05 s pause, the same
    path is requested from the public ORCID URL with the public headers.

    Returns:
        ``(response, status)`` where ``status`` is the status code of the
        returned ``response``.
    """
    response = requests.get(member_orcid_url + orcid + '/works', headers=member_headers)
    status = response.status_code
    if status != 200:
        print(orcid, status, response.reason)
        time.sleep(0.05)
        response = requests.get(pub_orcid_url + orcid + '/works', headers=public_headers)
        # Report the status of the response actually returned. Keeping the
        # failed member status made callers discard every successful
        # public fallback.
        status = response.status_code
    return response, status
def fetch_work(orcid, putcode, visibility):
    """Fetch a single work record for *orcid* identified by *putcode*.

    A work whose visibility is "public" is read from the public ORCID URL
    after a 0.05 s pause; any other visibility is read from the member
    URL with the member headers.

    Returns:
        The response object of the request that was made.
    """
    work_path = orcid + '/work/' + str(putcode)
    if visibility == "public":
        time.sleep(0.05)
        return requests.get(pub_orcid_url + work_path, headers=public_headers)
    return requests.get(member_orcid_url + work_path, headers=member_headers)
def _empty_summary(user, status, note):
    """Return a summary row for *user* with no work data, only *note*."""
    return {'email': user['email'], 'orcid': user['orcid'], 'status': status,
            'put-code': '', 'source': '', 'title': '', 'external-id-type': '',
            'external-id-value': '', 'type': '', 'publication-date-year': '',
            'visibility': '', 'note': note}


def _work_summary_row(user, status, pub_sum):
    """Flatten one work summary (*pub_sum*) into a summary row for *user*."""
    source = pub_sum['source']['source-name']
    if source:
        source = source['value']

    # Optional parts of the summary may be null or absent; KeyError is
    # caught alongside IndexError/TypeError so that a missing key yields
    # the default instead of aborting the whole run.
    try:
        title = pub_sum['title']['title']['value']
    except (IndexError, KeyError, TypeError):
        title = "No Title"

    external_id_type = ""
    external_id_value = ""
    try:
        first_id = pub_sum['external-ids']['external-id'][0]
        if len(first_id['external-id-value']) > 0:
            external_id_value = first_id['external-id-value']
            external_id_type = first_id['external-id-type']
    except (IndexError, KeyError, TypeError):
        external_id_value = ""
        external_id_type = ""

    try:
        pub_year = pub_sum['publication-date']['year']['value']
    except (IndexError, KeyError, TypeError):
        pub_year = ""

    return {'email': user['email'], 'orcid': user['orcid'],
            'status': status, 'put-code': pub_sum['put-code'], 'source': source,
            'title': title, 'external-id-type': external_id_type,
            'external-id-value': external_id_value, 'type': pub_sum['type'],
            'publication-date-year': pub_year, 'visibility': pub_sum['visibility'],
            'note': 'OK'}


def fetch_orcid_works(skip_orcid=""):
    """Build one summary row per work for every Hub user.

    Users come from ``fetch_hub_users``. For each user that is confirmed
    and has an ORCID iD, the works list is fetched and the first
    work-summary of every group becomes a row. Users without works, and
    users that are unconfirmed or lack an iD, get a single placeholder
    row with an explanatory note. A user whose works list cannot be read
    (status other than 200) is printed and contributes no row.

    Args:
        skip_orcid: when non-empty, users are skipped up to AND INCLUDING
            the one with this ORCID iD; collation starts with the next
            user.

    Returns:
        A list of row dicts.
    """
    pub_summaries = []
    hub_users = fetch_hub_users()
    if not isinstance(hub_users, list):
        return pub_summaries

    collating = skip_orcid == ""
    for user in hub_users:
        if not collating:
            # Still skipping: the matching user switches collation on for
            # the users that follow it.
            if user['orcid'] == skip_orcid:
                collating = True
            continue

        if not (user['confirmed'] and user['orcid']):
            pub_summaries.append(_empty_summary(user, '', 'ORCID iD not confirmed by Hub'))
            continue

        response, status = fetch_works_list(user['orcid'])
        if status != 200:
            print(user['orcid'], str(status), response.reason)
            continue

        print(user['orcid'])
        pubs = response.json()['group']
        if not pubs:
            pub_summaries.append(_empty_summary(user, status, 'No visible works'))
            continue

        for pub in pubs:
            # Only the first summary of each group is used.
            pub_summaries.append(_work_summary_row(user, status, pub['work-summary'][0]))
    return pub_summaries
def _tsv_cell(value):
    """Render *value* as the text of one TSV cell.

    None becomes an empty cell; tabs, carriage returns and newlines are
    replaced by spaces so a value can never break the row/column layout.
    """
    if value is None:
        return ""
    text = str(value)
    for control in ('\t', '\r', '\n'):
        text = text.replace(control, ' ')
    return text


def _work_value(data, *path):
    """Follow *path* (dict keys / list indexes) through *data*.

    Returns the value reached (which may be None), or "" as soon as a
    step is missing or the current value cannot be indexed.
    """
    for step in path:
        try:
            data = data[step]
        except (IndexError, KeyError, TypeError):
            return ""
    return data


def _work_detail_fields(work, external_id_value):
    """Extract the detail columns of one full work record."""
    citation_type = ""
    citation_value = ""
    citation = _work_value(work, 'citation')
    if citation:
        citation_type = _work_value(citation, 'citation-type')
        citation_value = _work_value(citation, 'citation-value')
        if isinstance(citation_value, str):
            citation_value = citation_value.replace('\n', "").strip()

    # The relationship is only reported when the summary had an external id.
    external_id_relationship = ""
    if external_id_value:
        external_id_relationship = _work_value(
            work, 'external-ids', 'external-id', 0, 'external-id-relationship')

    return {
        'short-description': _work_value(work, 'short-description'),
        'journal-title': _work_value(work, 'journal-title', 'value'),
        'citation-type': citation_type,
        'citation-value': citation_value,
        'external-id-url': _work_value(
            work, 'external-ids', 'external-id', 0, 'external-id-url', 'value'),
        'external-id-relationship': external_id_relationship,
        'url': _work_value(work, 'url', 'value'),
        'language-code': _work_value(work, 'language-code'),
        'country': _work_value(work, 'country', 'value'),
    }


def fetch_work_details(skip_orcid=""):
    """Write one TSV row per summary row to ``organisation_works.tsv``.

    The file is (re)created and the heading row written first; summary
    rows are then obtained from ``fetch_orcid_works(skip_orcid)``. For a
    row that has a put-code and status 200 the full work is fetched with
    ``fetch_work`` and its detail columns are filled in. If that fetch
    does not return status 200, or its body is not JSON, the failure is
    printed, the detail columns stay empty and the row's note records the
    failure, so one bad work no longer aborts the export.

    Rows are written one at a time inside a ``with`` block, so the file
    is closed and already-written rows are kept if an error occurs.

    Args:
        skip_orcid: passed through to ``fetch_orcid_works``.

    Returns:
        The number of data rows written (the heading row is not counted).
    """
    # Explicit column order, matching the heading row below.
    columns = ('email', 'orcid', 'status', 'put-code', 'visibility', 'source', 'title',
               'journal-title', 'short-description', 'citation-type', 'citation-value',
               'external-id-type', 'external-id-value', 'external-id-url',
               'external-id-relationship', 'type', 'publication-date-year', 'url',
               'language-code', 'country', 'note')
    row_headings = 'Email\tORCID iD\tResponse\tPut-Code\tVisibility\tSource\tTitle\tJournal Title\tShort Description' \
                   '\tCitation Type\tCitation Value\tExternal ID Type\tExternal ID Value\tExternal ID URL' \
                   '\tExternal ID Relationship\tType\tPublication Year\tURL\tLanguage Code\tCountry\tNote\n'

    rows_written = 0
    with open('organisation_works.tsv', 'w', encoding='utf-8') as outfile:
        outfile.write(row_headings)
        for record in fetch_orcid_works(skip_orcid):
            work_data = dict.fromkeys(columns, "")
            work_data.update(record)

            if record['put-code'] and record['status'] == 200:
                print(record['orcid'] + "/work/" + str(record['put-code']))
                response = fetch_work(record['orcid'], record['put-code'], record['visibility'])
                work = None
                if response.status_code == 200:
                    try:
                        work = response.json()
                    except ValueError:
                        work = None
                if work is None:
                    print(record['orcid'], record['put-code'], response.status_code, response.reason)
                    work_data['note'] = 'Work details unavailable: {0} {1}'.format(
                        response.status_code, response.reason)
                else:
                    work_data.update(_work_detail_fields(work, record['external-id-value']))

            outfile.write('\t'.join(_tsv_cell(work_data[column]) for column in columns) + '\n')
            rows_written += 1
    return rows_written
def write_works(skip_orcid=""):
    """Run the export and return a short summary message.

    Delegates to ``fetch_work_details(skip_orcid)`` and reports the row
    count it returns.
    """
    return '{0} rows written to file'.format(fetch_work_details(skip_orcid))
if __name__ == "__main__":
    # Run the export only when executed as a script, so that importing this
    # module does not start writing the output file.
    print(write_works(skip_orcid=""))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment