# shyamd/collab_list.py

## collab_list.py
import click
from datetime import datetime, timedelta
from itertools import chain
from tqdm import tqdm
from pandas import DataFrame
from pybliometrics.scopus import (
    AuthorRetrieval,
    AuthorSearch,
    ScopusSearch,
    ContentAffiliationRetrieval,
)
from monty.serialization import yaml
from io import StringIO


def get_author_auid(auid):
    """
    Gets the author record by auid

    Args:
        auid: Scopus author ID or EID. An optional leading "a:" marker (the
            prefix the CLI uses to flag AU-IDs) is removed before the lookup.
            Non-string values are converted with ``str`` first.

    Returns:
        The ``AuthorRetrieval`` object constructed for the cleaned ID.
    """
    # Coerce before calling str methods: run() forwards ``author.identifier``
    # here, and that value is not guaranteed to be a string.
    auid = str(auid).strip()
    if auid.startswith("a:"):
        auid = auid[2:]
    return AuthorRetrieval(auid)


def get_author(ocid):
    """
    Looks up the author record that belongs to an ORCID

    Runs an ``orcid(...)`` author search and resolves the first hit through
    ``get_author_auid``, using the hit's ``auid`` attribute when present and
    its ``eid`` attribute otherwise.

    Returns:
        The retrieved author record, or None when the search has no hits or
        the first hit exposes neither attribute.
    """
    search = AuthorSearch(f"orcid({ocid})")

    # Guard clause: nothing matched this ORCID
    if not search.authors:
        return None

    first_hit = search.authors[0]
    for field in ("auid", "eid"):
        if hasattr(first_hit, field):
            return get_author_auid(getattr(first_hit, field))
    return None


def get_affiliatoin(auid):
    """
    Helper function to get an author's affiliation through AuthorSearch,
    because author.affiliation_current from AuthorRetrieval appears to be
    very buggy

    Args:
        auid: Scopus author ID, interpolated into an ``au-id(...)`` query.

    Returns:
        The ``affiliation`` attribute of the first search hit, or None when
        the search produced no hits.
    """
    matches = AuthorSearch(f"au-id({auid})").authors
    # Same emptiness check get_author() performs; indexing an empty/None
    # result would abort the whole run for a single unknown ID.
    if not matches:
        return None
    return matches[0].affiliation


def read_input(data):
    """
    Pre-sanitize input from a number of formats

    Accepts a YAML list of IDs as well as free text in which the IDs are
    separated by commas and/or any whitespace (spaces, newlines, tabs).

    Args:
        data: Readable text file object holding the IDs.

    Returns:
        A list of non-empty ID strings with surrounding whitespace removed;
        an empty list when the input holds no IDs.
    """
    raw = data.read()
    try:
        parsed = yaml.safe_load(StringIO(raw))
    except Exception:
        # Deliberate best-effort: the concrete YAML backend (and therefore its
        # error type) is not visible here, so any parse failure simply means
        # "treat the input as plain text".
        parsed = raw

    if isinstance(parsed, (list, tuple)):
        items = parsed
    elif isinstance(parsed, str):
        items = [parsed]
    else:
        # None (empty input), numbers, dates or mappings: YAML interpreted the
        # text as something other than IDs, so tokenize the raw text instead.
        items = [raw]

    ids = []
    for item in items:
        if item is None:
            continue
        # Treat commas as whitespace so "a, b", "a,b" and "a b" all tokenize
        # identically and no ID keeps stray leading/trailing blanks.
        ids.extend(str(item).replace(",", " ").split())
    return ids


def _recent_coauthor_ids(docs, oldest):
    """
    Collects the author IDs listed on every document newer than ``oldest``

    Tolerates ``docs`` being None and skips documents that lack a cover date
    or an author list instead of raising. The returned list keeps duplicates.
    """
    found = []
    for doc in docs or []:
        if not doc.coverDate or not doc.author_ids:
            continue
        if datetime.fromisoformat(doc.coverDate) > oldest:
            found.extend(
                part.strip() for part in doc.author_ids.split(";") if part.strip()
            )
    return found


def _write_collab_csv(path, coauthors, affiliations, excluded, only_ids=None):
    """
    Writes the (lastname, firstname, institution) CSV for the coauthors whose
    affiliation is not in ``excluded``, optionally limited to ``only_ids``
    """
    rows = [
        (coauthor.surname, coauthor.given_name, affiliations[coauthor_id])
        for coauthor_id, coauthor in coauthors.items()
        if (only_ids is None or coauthor_id in only_ids)
        and affiliations[coauthor_id] not in excluded
    ]
    df = DataFrame(rows, columns=["lastname", "firstname", "institution"])
    df.to_csv(path)


@click.command()
@click.option(
    "-y", "--years_back", default=4, type=int, help="Number of years back to include"
)
@click.option(
    "-o",
    "--output",
    default="collabs.csv",
    help="CSV file to output the collaborator list to",
)
@click.option(
    "--induvidual",
    "--individual",
    "induvidual",
    default=False,
    is_flag=True,
    help="Generate induvidual collaborator lists for the authors",
)
@click.argument("ids", type=click.File("r"))
def run(ids, years_back, output, induvidual):
    """
    Builds a collaborator list from a list of ORCIDs or Scopus AU-IDs for authors given as a space or comma separated list

    Example:\n
        echo DUMMY-OC-ID | python collab_list.py -\n
        echo DUMMY-OC-ID1 DUMMY-OC-ID2 | python collab_list.py -\n
        echo a:DUMMY_AUID | python collab_list.py -\n
        python collab_list.py oc_id_list.txt\n
    """
    # NOTE: the docstring above doubles as the --help text, so parameter
    # documentation lives in this comment instead.
    #   ids         - open text file (or stdin via "-") holding the IDs; entries
    #                 starting with "a:" are AU-IDs, everything else is an ORCID
    #   years_back  - only publications newer than this many years are used
    #   output      - path of the combined CSV
    #   induvidual  - also write one CSV per author, named
    #                 <output stem>.<surname>.csv (flag spelling kept for
    #                 compatibility; --individual is accepted as an alias)

    # Normalise whatever read_input produced into a clean list of strings
    parsed_ids = read_input(ids)
    if parsed_ids is None:
        parsed_ids = []
    elif not isinstance(parsed_ids, (list, tuple)):
        parsed_ids = [parsed_ids]
    ids = [text for text in (str(item).strip() for item in parsed_ids) if text]

    oldest = datetime.utcnow() - timedelta(days=365.25 * years_back)

    # Get Author Objects for everyone involved, keyed by their ID as a string
    # so they compare equal to the string IDs parsed from the publications
    authors_by_id = {}
    tqdm.write(f"Getting Author IDs for {len(ids)} authors")
    for sub_id in tqdm(ids, desc="Getting Author List"):
        if sub_id.startswith("a:"):
            author = get_author_auid(sub_id)
        else:
            author = get_author(sub_id)

        if author:
            authors_by_id[str(author.identifier)] = author
        else:
            # tqdm.write keeps the progress bar intact, unlike print
            tqdm.write(f"Found no records for {sub_id}")

    # Get map of author to CoAuthor IDs for all CoAuthors from publications
    # newer than the years_back cut-off
    coauthor_map = {}
    for author_id, author in tqdm(
        authors_by_id.items(), desc="Getting Publication List For Authors"
    ):
        coauthor_map[author_id] = _recent_coauthor_ids(author.get_documents(), oldest)

    # All coauthor ids
    all_coauthor_ids = list(chain.from_iterable(coauthor_map.values()))
    unique_coauthor_ids = set(all_coauthor_ids)

    tqdm.write(
        f"Found {len(unique_coauthor_ids)} coauthors with "
        f"{len(all_coauthor_ids)-len(unique_coauthor_ids)} duplicates"
    )

    # Remove the authors themselves; sort so the CSV row order is reproducible
    coauthor_id_list = sorted(unique_coauthor_ids - set(authors_by_id))

    # Convert to Author Objects
    coauthors = {
        coauth_id: AuthorRetrieval(coauth_id)
        for coauth_id in tqdm(coauthor_id_list, desc="Getting Co-Authors")
    }

    # Get master affiliation list
    affiliatons = {}
    for author_id in tqdm(authors_by_id, desc="Getting Author Affiliatons"):
        affiliatons[author_id] = get_affiliatoin(author_id)

    author_affiliations = set(affiliatons.values())
    # An author without a known affiliation must not cause every coauthor
    # with an unknown affiliation to be dropped as "same institution"
    author_affiliations.discard(None)

    for coauthor_id in tqdm(coauthors, desc="Getting CoAuthor Affiliatons"):
        affiliatons[coauthor_id] = get_affiliatoin(coauthor_id)

    _write_collab_csv(output, coauthors, affiliatons, author_affiliations)

    if induvidual:
        output_stem = output.split(".csv")[0]
        for author_id, coauthor_ids in coauthor_map.items():
            # Reuse the record fetched above rather than retrieving it again
            author_surname = authors_by_id[author_id].surname or author_id
            output_name = ".".join(
                [output_stem, author_surname.replace(" ", "."), "csv"]
            )
            _write_collab_csv(
                output_name,
                coauthors,
                affiliatons,
                author_affiliations,
                only_ids=set(coauthor_ids),
            )

# Script entry point: invoke the click command only when executed directly
if __name__ == "__main__":
    run()
	import click
	from datetime import datetime, timedelta
	from itertools import chain
	from tqdm import tqdm
	from pandas import DataFrame
	from pybliometrics.scopus import (
	AuthorRetrieval,
	AuthorSearch,
	ScopusSearch,
	ContentAffiliationRetrieval,
	)
	from monty.serialization import yaml
	from io import StringIO


	def get_author_auid(auid):
	    """
	    Gets the author record by auid

	    Args:
	        auid: Scopus author ID or EID. An optional leading "a:" marker (the
	            prefix the CLI uses to flag AU-IDs) is removed before the lookup.
	            Non-string values are converted with ``str`` first.

	    Returns:
	        The ``AuthorRetrieval`` object constructed for the cleaned ID.
	    """
	    # Coerce before calling str methods: run() forwards ``author.identifier``
	    # here, and that value is not guaranteed to be a string.
	    auid = str(auid).strip()
	    if auid.startswith("a:"):
	        auid = auid[2:]
	    return AuthorRetrieval(auid)


	def get_author(ocid):
	    """
	    Gets the author record given an ORCID

	    Runs an ``orcid(...)`` author search and resolves the first hit through
	    ``get_author_auid``, using the hit's ``auid`` attribute when present and
	    its ``eid`` attribute otherwise.

	    Returns:
	        The retrieved author record, or None when the search has no hits or
	        the first hit exposes neither attribute.
	    """
	    au_search = AuthorSearch(f"orcid({ocid})")

	    if not au_search.authors:
	        return None

	    au = au_search.authors[0]
	    if hasattr(au, "auid"):
	        return get_author_auid(au.auid)
	    if hasattr(au, "eid"):
	        return get_author_auid(au.eid)
	    return None


	def get_affiliatoin(auid):
	    """
	    Helper function to get an author's affiliation through AuthorSearch,
	    because author.affiliation_current from AuthorRetrieval appears to be
	    very buggy

	    Args:
	        auid: Scopus author ID, interpolated into an ``au-id(...)`` query.

	    Returns:
	        The ``affiliation`` attribute of the first search hit, or None when
	        the search produced no hits.
	    """
	    matches = AuthorSearch(f"au-id({auid})").authors
	    # Same emptiness check get_author() performs; indexing an empty/None
	    # result would abort the whole run for a single unknown ID.
	    if not matches:
	        return None
	    return matches[0].affiliation


	def read_input(data):
	    """
	    Pre-sanitize input from a number of formats

	    Accepts a YAML list of IDs as well as free text in which the IDs are
	    separated by commas and/or any whitespace (spaces, newlines, tabs).

	    Args:
	        data: Readable text file object holding the IDs.

	    Returns:
	        A list of non-empty ID strings with surrounding whitespace removed;
	        an empty list when the input holds no IDs.
	    """
	    raw = data.read()
	    try:
	        parsed = yaml.safe_load(StringIO(raw))
	    except Exception:
	        # Deliberate best-effort: the concrete YAML backend (and therefore its
	        # error type) is not visible here, so any parse failure simply means
	        # "treat the input as plain text".
	        parsed = raw

	    if isinstance(parsed, (list, tuple)):
	        items = parsed
	    elif isinstance(parsed, str):
	        items = [parsed]
	    else:
	        # None (empty input), numbers, dates or mappings: YAML interpreted the
	        # text as something other than IDs, so tokenize the raw text instead.
	        items = [raw]

	    ids = []
	    for item in items:
	        if item is None:
	            continue
	        # Treat commas as whitespace so "a, b", "a,b" and "a b" all tokenize
	        # identically and no ID keeps stray leading/trailing blanks.
	        ids.extend(str(item).replace(",", " ").split())
	    return ids


	def _recent_coauthor_ids(docs, oldest):
	    """
	    Collects the author IDs listed on every document newer than ``oldest``

	    Tolerates ``docs`` being None and skips documents that lack a cover date
	    or an author list instead of raising. The returned list keeps duplicates.
	    """
	    found = []
	    for doc in docs or []:
	        if not doc.coverDate or not doc.author_ids:
	            continue
	        if datetime.fromisoformat(doc.coverDate) > oldest:
	            found.extend(
	                part.strip() for part in doc.author_ids.split(";") if part.strip()
	            )
	    return found


	def _write_collab_csv(path, coauthors, affiliations, excluded, only_ids=None):
	    """
	    Writes the (lastname, firstname, institution) CSV for the coauthors whose
	    affiliation is not in ``excluded``, optionally limited to ``only_ids``
	    """
	    rows = [
	        (coauthor.surname, coauthor.given_name, affiliations[coauthor_id])
	        for coauthor_id, coauthor in coauthors.items()
	        if (only_ids is None or coauthor_id in only_ids)
	        and affiliations[coauthor_id] not in excluded
	    ]
	    df = DataFrame(rows, columns=["lastname", "firstname", "institution"])
	    df.to_csv(path)


	@click.command()
	@click.option(
	    "-y", "--years_back", default=4, type=int, help="Number of years back to include"
	)
	@click.option(
	    "-o",
	    "--output",
	    default="collabs.csv",
	    help="CSV file to output the collaborator list to",
	)
	@click.option(
	    "--induvidual",
	    "--individual",
	    "induvidual",
	    default=False,
	    is_flag=True,
	    help="Generate induvidual collaborator lists for the authors",
	)
	@click.argument("ids", type=click.File("r"))
	def run(ids, years_back, output, induvidual):
	    """
	    Builds a collaborator list from a list of ORCIDs or Scopus AU-IDs for authors given as a space or comma separated list

	    Example:\n
	        echo DUMMY-OC-ID | python collab_list.py -\n
	        echo DUMMY-OC-ID1 DUMMY-OC-ID2 | python collab_list.py -\n
	        echo a:DUMMY_AUID | python collab_list.py -\n
	        python collab_list.py oc_id_list.txt\n
	    """
	    # NOTE: the docstring above doubles as the --help text, so parameter
	    # documentation lives in this comment instead.
	    #   ids         - open text file (or stdin via "-") holding the IDs; entries
	    #                 starting with "a:" are AU-IDs, everything else is an ORCID
	    #   years_back  - only publications newer than this many years are used
	    #   output      - path of the combined CSV
	    #   induvidual  - also write one CSV per author, named
	    #                 <output stem>.<surname>.csv (flag spelling kept for
	    #                 compatibility; --individual is accepted as an alias)

	    # Normalise whatever read_input produced into a clean list of strings
	    parsed_ids = read_input(ids)
	    if parsed_ids is None:
	        parsed_ids = []
	    elif not isinstance(parsed_ids, (list, tuple)):
	        parsed_ids = [parsed_ids]
	    ids = [text for text in (str(item).strip() for item in parsed_ids) if text]

	    oldest = datetime.utcnow() - timedelta(days=365.25 * years_back)

	    # Get Author Objects for everyone involved, keyed by their ID as a string
	    # so they compare equal to the string IDs parsed from the publications
	    authors_by_id = {}
	    tqdm.write(f"Getting Author IDs for {len(ids)} authors")
	    for sub_id in tqdm(ids, desc="Getting Author List"):
	        if sub_id.startswith("a:"):
	            author = get_author_auid(sub_id)
	        else:
	            author = get_author(sub_id)

	        if author:
	            authors_by_id[str(author.identifier)] = author
	        else:
	            # tqdm.write keeps the progress bar intact, unlike print
	            tqdm.write(f"Found no records for {sub_id}")

	    # Get map of author to CoAuthor IDs for all CoAuthors from publications
	    # newer than the years_back cut-off
	    coauthor_map = {}
	    for author_id, author in tqdm(
	        authors_by_id.items(), desc="Getting Publication List For Authors"
	    ):
	        coauthor_map[author_id] = _recent_coauthor_ids(author.get_documents(), oldest)

	    # All coauthor ids
	    all_coauthor_ids = list(chain.from_iterable(coauthor_map.values()))
	    unique_coauthor_ids = set(all_coauthor_ids)

	    tqdm.write(
	        f"Found {len(unique_coauthor_ids)} coauthors with "
	        f"{len(all_coauthor_ids)-len(unique_coauthor_ids)} duplicates"
	    )

	    # Remove the authors themselves; sort so the CSV row order is reproducible
	    coauthor_id_list = sorted(unique_coauthor_ids - set(authors_by_id))

	    # Convert to Author Objects
	    coauthors = {
	        coauth_id: AuthorRetrieval(coauth_id)
	        for coauth_id in tqdm(coauthor_id_list, desc="Getting Co-Authors")
	    }

	    # Get master affiliation list
	    affiliatons = {}
	    for author_id in tqdm(authors_by_id, desc="Getting Author Affiliatons"):
	        affiliatons[author_id] = get_affiliatoin(author_id)

	    author_affiliations = set(affiliatons.values())
	    # An author without a known affiliation must not cause every coauthor
	    # with an unknown affiliation to be dropped as "same institution"
	    author_affiliations.discard(None)

	    for coauthor_id in tqdm(coauthors, desc="Getting CoAuthor Affiliatons"):
	        affiliatons[coauthor_id] = get_affiliatoin(coauthor_id)

	    _write_collab_csv(output, coauthors, affiliatons, author_affiliations)

	    if induvidual:
	        output_stem = output.split(".csv")[0]
	        for author_id, coauthor_ids in coauthor_map.items():
	            # Reuse the record fetched above rather than retrieving it again
	            author_surname = authors_by_id[author_id].surname or author_id
	            output_name = ".".join(
	                [output_stem, author_surname.replace(" ", "."), "csv"]
	            )
	            _write_collab_csv(
	                output_name,
	                coauthors,
	                affiliatons,
	                author_affiliations,
	                only_ids=set(coauthor_ids),
	            )


	# Script entry point: invoke the click command only when executed directly
	if __name__ == "__main__":
	    run()