Skip to content

Instantly share code, notes, and snippets.

@shyamd
Created January 8, 2021 02:22
Show Gist options
  • Save shyamd/7f8f03d675a874b4884d696d3ef059c9 to your computer and use it in GitHub Desktop.
Save shyamd/7f8f03d675a874b4884d696d3ef059c9 to your computer and use it in GitHub Desktop.
Script to generate a collaborator list for a set of authors using the Scopus API
import click
from datetime import datetime, timedelta
from itertools import chain
from tqdm import tqdm
from pandas import DataFrame
from pybliometrics.scopus import (
AuthorRetrieval,
AuthorSearch,
ScopusSearch,
ContentAffiliationRetrieval,
)
from monty.serialization import yaml
from io import StringIO
def get_author_auid(auid):
    """
    Retrieve the author record for a Scopus AU-ID, with or without the "a:" input prefix
    """
    marker = "a:"
    # Drop the command-line marker so that only the bare ID reaches the API wrapper
    bare_id = auid[len(marker):] if auid.startswith(marker) else auid
    return AuthorRetrieval(bare_id)
def get_author(ocid):
    """
    Look up the author record that belongs to an ORCID

    Returns None when the search has no hits or the first hit exposes
    neither an "auid" nor an "eid" attribute.
    """
    hits = AuthorSearch(f"orcid({ocid})").authors
    if not hits:
        return None

    first_hit = hits[0]
    # Prefer "auid" and fall back to "eid", in that order
    for attr_name in ("auid", "eid"):
        if hasattr(first_hit, attr_name):
            return get_author_auid(getattr(first_hit, attr_name))
    return None
def get_affiliatoin(auid):
    """
    Helper function to get an author's affiliation through AuthorSearch, because
    author.affiliation_current from AuthorRetrieval appears to be very buggy

    Args:
        auid: Scopus author ID, interpolated into an "au-id(...)" search query.

    Returns:
        The ``affiliation`` attribute of the first search hit, or None when
        the search yields no authors.
    """
    hits = AuthorSearch(f"au-id({auid})").authors
    if not hits:
        # An empty result would make the [0] lookup raise and abort the whole
        # run; report "unknown affiliation" instead.
        return None
    return hits[0].affiliation
def read_input(data):
    """
    Pre-sanitize input from a number of formats

    The text is first handed to the YAML parser so that YAML lists are
    accepted. Anything else is treated as plain text and split on commas
    and/or whitespace.

    Args:
        data: Readable text file object (anything providing ``read()``).

    Returns:
        A list of non-empty, whitespace-stripped ID strings. Empty input
        yields an empty list.
    """
    text = data.read()
    try:
        parsed = yaml.safe_load(StringIO(text))
    except Exception:
        # Deliberate best effort: whatever goes wrong in the YAML layer (parse
        # error, unusable backend), fall back to plain-text splitting. Unlike
        # a bare ``except``, this lets KeyboardInterrupt/SystemExit through.
        parsed = text

    if isinstance(parsed, (list, tuple)):
        candidates = [item for item in parsed if item is not None]
    else:
        if not isinstance(parsed, str):
            # YAML turned the input into a non-string scalar, a mapping or
            # None (e.g. a bare numeric ID or empty input); use the raw text.
            parsed = text
        # Accept commas, whitespace, or a mix of both as separators.
        candidates = parsed.replace(",", " ").split()

    stripped = (str(item).strip() for item in candidates)
    return [item for item in stripped if item]
def _collab_frame(coauthors, affiliations, excluded_affiliations, only_ids=None):
    """
    Build the collaborator table (lastname, firstname, institution)

    Co-authors whose affiliation is in ``excluded_affiliations`` are left out;
    when ``only_ids`` is given, so are co-authors whose ID is not in it.
    """
    rows = [
        (coauthor.surname, coauthor.given_name, affiliations[coauthor_id])
        for coauthor_id, coauthor in coauthors.items()
        if (only_ids is None or coauthor_id in only_ids)
        and affiliations[coauthor_id] not in excluded_affiliations
    ]
    return DataFrame(rows, columns=["lastname", "firstname", "institution"])


@click.command()
@click.option(
    "-y", "--years_back", default=4, type=int, help="Number of years back to include"
)
@click.option(
    "-o",
    "--output",
    default="collabs.csv",
    help="CSV file to output the collaborator list to",
)
@click.option(
    "--induvidual",
    "--individual",
    "induvidual",
    default=False,
    is_flag=True,
    help="Generate induvidual collaborator lists for the authors",
)
@click.argument("ids", type=click.File("r"))
def run(ids, years_back, output, induvidual):
    """
    Builds a collaborator list from a list of ORCIDs or Scopus AU-IDs for authors given as a space or comma separated list

    Example:\n
    echo DUMMY-OC-ID | python collab_list.py -\n
    echo DUMMY-OC-ID1 DUMMY-OC-ID2 | python collab_list.py -\n
    echo a:DUMMY_AUID | python collab_list.py -\n
    python collab_list.py oc_id_list.txt\n
    """
    # Parameters (kept out of the docstring because click prints it as --help):
    #   ids        - open text file (or stdin) holding the author IDs
    #   years_back - only publications newer than this many years are used
    #   output     - path of the combined CSV file
    #   induvidual - also write one CSV per author next to ``output``
    ids = read_input(ids)
    oldest = datetime.utcnow() - timedelta(days=365.25 * years_back)

    # Get Author Objects for everyone involved
    authors = []
    tqdm.write(f"Getting Author IDs for {len(ids)} authors")
    for sub_id in tqdm(ids, desc="Getting Author List"):
        sub_id = str(sub_id).strip()
        if not sub_id:
            continue
        if sub_id.startswith("a:"):
            author = get_author_auid(sub_id)
        else:
            author = get_author(sub_id)
        if author:
            authors.append(author)
        else:
            # tqdm.write keeps the progress bar intact, unlike print
            tqdm.write(f"Found no records for {sub_id}")

    # Key everything by the ID as a string: the co-author IDs below come from
    # splitting a string, while author.identifier may be an int depending on
    # the pybliometrics version (TODO confirm), which would defeat the set
    # difference and the membership tests further down.
    authors_by_id = {str(author.identifier): author for author in authors}

    # Get map of author to CoAuthor IDs for all CoAuthors from publications
    # newer than years_back
    coauthor_map = {}
    for author_id, author in tqdm(
        authors_by_id.items(), desc="Getting Publication List For Authors"
    ):
        coauthor_ids = coauthor_map[author_id] = []
        # Guard against "no documents" being reported as None
        for doc in author.get_documents() or []:
            if not doc.coverDate or not doc.author_ids:
                continue
            if datetime.fromisoformat(doc.coverDate) > oldest:
                coauthor_ids.extend(doc.author_ids.split(";"))

    # All coauthor ids
    all_coauthor_ids = list(chain.from_iterable(coauthor_map.values()))
    unique_coauthor_ids = set(all_coauthor_ids)
    tqdm.write(
        f"Found {len(unique_coauthor_ids)} coauthors with "
        f"{len(all_coauthor_ids)-len(unique_coauthor_ids)} duplicates"
    )

    # Remove the queried authors themselves
    unique_coauthor_ids = unique_coauthor_ids - set(authors_by_id)

    # Convert to Author Objects
    coauthors = {
        coauth_id: AuthorRetrieval(coauth_id)
        for coauth_id in tqdm(unique_coauthor_ids, desc="Getting Co-Authors")
    }

    # Get master affiliation list
    affiliatons = {}
    for author_id in tqdm(authors_by_id, desc="Getting Author Affiliatons"):
        affiliatons[author_id] = get_affiliatoin(author_id)
    # Snapshot the authors' own institutions before co-authors are added
    author_affiliations = set(affiliatons.values())
    # An unknown affiliation must not filter out co-authors whose affiliation
    # is unknown as well
    author_affiliations.discard(None)

    for coauthor_id in tqdm(coauthors, desc="Getting CoAuthor Affiliatons"):
        affiliatons[coauthor_id] = get_affiliatoin(coauthor_id)

    _collab_frame(coauthors, affiliatons, author_affiliations).to_csv(output)

    if induvidual:
        # Strip only a trailing ".csv" so other dots in the path survive
        stem = output[: -len(".csv")] if output.endswith(".csv") else output
        for author_id, coauthor_ids in coauthor_map.items():
            # Reuse the record fetched above instead of retrieving it again
            author_surname = authors_by_id[author_id].surname or author_id
            frame = _collab_frame(
                coauthors,
                affiliatons,
                author_affiliations,
                only_ids=set(coauthor_ids),
            )
            # NOTE(review): two authors sharing a surname write the same file
            output_name = ".".join([stem, author_surname.replace(" ", "."), "csv"])
            frame.to_csv(output_name)
# Run the click command only when executed as a script, not on import
if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment