Created
January 8, 2021 02:22
-
-
Save shyamd/7f8f03d675a874b4884d696d3ef059c9 to your computer and use it in GitHub Desktop.
Script to generate a collaborator list for a set of authors using the SCOPUS API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import click | |
from datetime import datetime, timedelta | |
from itertools import chain | |
from tqdm import tqdm | |
from pandas import DataFrame | |
from pybliometrics.scopus import ( | |
AuthorRetrieval, | |
AuthorSearch, | |
ScopusSearch, | |
ContentAffiliationRetrieval, | |
) | |
from monty.serialization import yaml | |
from io import StringIO | |
def get_author_auid(auid):
    """
    Retrieve the author record for a Scopus author ID.

    The ID may be given with or without the "a:" marker that the command
    line uses to tell author IDs apart from ORCIDs; the marker is dropped
    before the lookup.
    """
    marker = "a:"
    scopus_id = auid[len(marker):] if auid.startswith(marker) else auid
    return AuthorRetrieval(scopus_id)
def get_author(ocid):
    """
    Look up an author record from an ORCID.

    Runs an ``orcid(...)`` author search and resolves the first hit through
    get_author_auid, using the hit's ``auid`` attribute when present and its
    ``eid`` attribute otherwise. Returns None when the search has no hits or
    the first hit carries neither attribute.
    """
    hits = AuthorSearch(f"orcid({ocid})").authors
    if not hits:
        return None

    first_hit = hits[0]
    for id_field in ("auid", "eid"):
        if hasattr(first_hit, id_field):
            return get_author_auid(getattr(first_hit, id_field))
    return None
def get_affiliatoin(auid):
    """
    Look up an author's affiliation through an ``au-id(...)`` author search.

    This is used instead of ``affiliation_current`` from AuthorRetrieval,
    which the original author found to be unreliable.

    Args:
        auid: Scopus author ID to search for.

    Returns:
        The ``affiliation`` field of the first search hit, or None when the
        search yields no authors.
    """
    matches = AuthorSearch(f"au-id({auid})").authors
    # Same guard get_author applies: the result list can be empty/None, and
    # indexing it blindly would abort the whole run for one unknown ID.
    if not matches:
        return None
    return matches[0].affiliation
def read_input(data):
    """
    Read author IDs from an open text stream and normalise them to a list.

    The text is first handed to the YAML loader so that YAML lists are
    accepted; anything that loads as a plain string (or does not load as a
    list at all) is treated as a comma and/or whitespace separated list.

    Args:
        data: Readable text file object (anything with ``read()``).

    Returns:
        A list of ID strings with surrounding whitespace removed and empty
        entries dropped. Empty input gives an empty list.
    """
    raw = data.read()
    try:
        # safe_load only: the input is user supplied.
        parsed = yaml.safe_load(StringIO(raw))
    except Exception:
        # Not loadable as YAML: fall back to plain-text splitting. Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit through.
        parsed = raw

    if isinstance(parsed, (list, tuple)):
        # YAML may type bare numeric IDs as ints; callers expect strings.
        candidates = [str(item) for item in parsed if item is not None]
    else:
        # Scalars, mappings and empty documents are re-read from the raw
        # text so nothing is lost to YAML's typing rules.
        text = parsed if isinstance(parsed, str) else raw
        candidates = text.replace(",", " ").split()

    return [entry.strip() for entry in candidates if entry.strip()]
def _collab_rows(coauthors, affiliations, excluded_affiliations, only_ids=None):
    """
    Build (surname, given name, affiliation) rows for a collaborator table.

    Co-authors whose affiliation is in ``excluded_affiliations`` are left
    out; when ``only_ids`` is given, co-authors whose ID is not in it are
    left out as well.
    """
    return [
        (coauthor.surname, coauthor.given_name, affiliations[coauthor_id])
        for coauthor_id, coauthor in coauthors.items()
        if (only_ids is None or coauthor_id in only_ids)
        and affiliations[coauthor_id] not in excluded_affiliations
    ]


@click.command()
@click.option(
    "-y", "--years_back", default=4, type=int, help="Number of years back to include"
)
@click.option(
    "-o",
    "--output",
    default="collabs.csv",
    help="CSV file to output the collaborator list to",
)
@click.option(
    "--induvidual",
    default=False,
    is_flag=True,
    help="Generate induvidual collaborator lists for the authors",
)
@click.argument("ids", type=click.File("r"))
def run(ids, years_back, output, induvidual):
    """
    Builds a collaborator list from a list of ORCIDs or Scopus AU-IDs for authors given as a space or comma separated list
    Example:\n
    echo DUMMY-OC-ID | python collab_list.py -\n
    echo DUMMY-OC-ID1 DUMMY-OC-ID2 | python collab_list.py -\n
    echo a:DUMMY_AUID | python collab_list.py -\n
    python collab_list.py oc_id_list.txt\n
    """
    columns = ["lastname", "firstname", "institution"]

    # Tolerate an empty result from the reader instead of failing on len().
    ids = read_input(ids) or []
    oldest = datetime.utcnow() - timedelta(days=365.25 * years_back)

    # Get Author Objects for everyone involved
    authors = []
    tqdm.write(f"Getting Author IDs for {len(ids)} authors")
    for sub_id in tqdm(ids, desc="Getting Author List"):
        # Entries may carry stray whitespace (e.g. "a, b") or be non-strings.
        sub_id = str(sub_id).strip()
        if not sub_id:
            continue
        if sub_id.startswith("a:"):
            author = get_author_auid(sub_id)
        else:
            author = get_author(sub_id)
        if author:
            authors.append(author)
        else:
            # tqdm.write keeps the message from garbling the progress bar.
            tqdm.write(f"Found no records for {sub_id}")

    # Map each author to the co-author IDs of their publications newer than
    # the years_back cut-off.
    coauthor_map = {}
    for author in tqdm(authors, desc="Getting Publication List For Authors"):
        coauthor_ids = coauthor_map[author.identifier] = []
        # NOTE(review): pybliometrics result lists appear to be None rather
        # than empty when there are no hits -- hence the "or []".
        for d in author.get_documents() or []:
            # Skip documents without author IDs instead of crashing on split.
            if not d.author_ids:
                continue
            if datetime.fromisoformat(d.coverDate) > oldest:
                coauthor_ids.extend(d.author_ids.split(";"))

    # All coauthor ids
    all_coauthor_ids = list(chain.from_iterable(coauthor_map.values()))
    tqdm.write(
        f"Found {len(set(all_coauthor_ids))} coauthors with "
        f"{len(all_coauthor_ids)-len(set(all_coauthor_ids))} duplicates"
    )

    # Remove duplicates and the authors themselves. Co-author IDs are strings
    # (split from author_ids), so the authors' identifiers are compared as
    # strings too; otherwise a non-string identifier would never match.
    own_ids = {str(author.identifier) for author in authors}
    unique_coauthor_ids = set(all_coauthor_ids) - own_ids

    # Convert to Author Objects
    coauthors = {
        coauth_id: AuthorRetrieval(coauth_id)
        for coauth_id in tqdm(unique_coauthor_ids, desc="Getting Co-Authors")
    }

    # Affiliations of the input authors: co-authors sharing one are excluded.
    author_affiliations = {
        get_affiliatoin(author.identifier)
        for author in tqdm(authors, desc="Getting Author Affiliatons")
    }
    affiliatons = {
        coauthor_id: get_affiliatoin(coauthor_id)
        for coauthor_id in tqdm(coauthors, desc="Getting CoAuthor Affiliatons")
    }

    # Combined list across all input authors.
    DataFrame(
        _collab_rows(coauthors, affiliatons, author_affiliations), columns=columns
    ).to_csv(output)

    if induvidual:
        # Reuse the records already fetched instead of retrieving each author
        # again just to read the surname.
        surnames = {author.identifier: author.surname for author in authors}
        output_stem = output.split(".csv")[0]
        for author_id, coauthor_ids in coauthor_map.items():
            rows = _collab_rows(
                coauthors,
                affiliatons,
                author_affiliations,
                only_ids=set(coauthor_ids),
            )
            # <output stem>.<surname with spaces as dots>.csv
            output_name = ".".join(
                [output_stem, surnames[author_id].replace(" ", "."), "csv"]
            )
            DataFrame(rows, columns=columns).to_csv(output_name)
# Script entry point: invoke the click command only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment