# computron/co_authors_v3.py

## co_authors_v3.py
"""
OVERVIEW: This script will automatically export a list of all your co-authors and
their institutions to an Excel file based on information already in the Scopus database.

LIMITATIONS:
1. Only up to 150 collaborators are supported by Scopus.
2. Sometimes, you want to filter by collaborators for only the last 4 years. Unfortunately, there
is no simple way to do this up front, but by enabling the years option you can filter at the end.

INSTRUCTIONS (these are actually important!):

0. Make sure your institution has a Scopus subscription. Also make sure you have the "pandas", "bs4" (BeautifulSoup), and "pybliometrics" Python libraries installed; you may also need supplemental libraries such as lxml (HTML parsing) and openpyxl (Excel export).
1. Find your author profile on scopus.com, e.g. https://www.scopus.com/authid/detail.uri?authorId=7404463800
2. There should be some text above some other results that states how many co-authors you have, (i) click that text AND (ii) also click "View in Search Results Format"
3. Click "exclude" to exclude institutions from the proposal (e.g., your home institution) or other exclusionary factors. Use the sidebar to click some attributes then click "Exclude".
--->Remember you are limited to 150 results. So excluding results might be important!! If you cannot exclude enough authors in one pass, repeat this process twice (with different exclusion filters) so that each time you have only 150 authors max.
4. Make sure you display 200 results per page so you get everything on this page.
5. Export the HTML page as a file called "coauthors.html" and move it to your Desktop. You need to use your browser's "Save As" function to save the page as HTML.
6. In this script, update the variables "in_path", "out_path", "find_years" (set it to True if you want to export year information), and "scopus_id" (required if you want to export year information).
7. RUN!! You should see a file called coauthors.xlsx appear in the Desktop.
8. Double check the results, don't just trust the computer to do the right thing ...

"""
from pybliometrics.scopus import ScopusSearch


def cond(x):
    """Tell whether an element id belongs to a table row that holds real data.

    Passed as the ``id=`` filter when collecting ``<tr>`` elements from the
    results table. A missing id (``None``) or an empty string yields ``False``;
    otherwise the result is whether the id starts with ``"resultDataRow"``.
    """
    return bool(x) and x.startswith("resultDataRow")


def split_author_name(author):
    """Split a ``"Lastname, Firstname"`` string into ``(lastname, firstname)``.

    The split happens at the first comma and both parts are stripped of
    surrounding whitespace. When *author* contains no comma, ``firstname`` is
    an empty string instead of an ``IndexError`` being raised.
    """
    lastname, _, firstname = author.partition(",")
    return lastname.strip(), firstname.strip()


def latest_year(cover_dates):
    """Return the most recent four-digit year found in *cover_dates*.

    *cover_dates* is an iterable of date strings that begin with the year
    (e.g. ``"2021-01-15"`` or just ``"2021"``). ``None``/empty entries are
    skipped, and ``None`` in place of the iterable is treated as empty.
    Returns ``None`` when there is no usable entry.
    """
    years = [date[:4] for date in (cover_dates or ()) if date]
    # four-digit year strings order the same way lexicographically and numerically
    return max(years) if years else None


if __name__ == "__main__":
    import os

    import bs4 as bs
    from pandas import DataFrame

    in_path = os.path.expanduser(
        "~/Desktop/coauthors.html")  # location containing the input HTML file from Scopus
    out_path = os.path.expanduser(
        "~/Desktop/coauthors.xlsx")  # location to place the output Excel file

    find_years = True  # whether to get the last year of joint publication; this can take time so only enable if needed
    scopus_id = 7404463800  # if you enable find_years, you MUST enter your scopus id.

    if find_years and not scopus_id:
        raise RuntimeError(
            "Must enter scopus_id if you enable find_years! If you see this message, update the script with your scopus_id and read the instructions!!")

    sort_col = "year" if find_years else "institution"  # sort the data at the end by this column

    # Read raw bytes and let BeautifulSoup detect the page encoding, rather than
    # decoding with the platform default (which can garble accented author names).
    with open(in_path, "rb") as f:
        source = f.read()

    soup = bs.BeautifulSoup(source, features="lxml")

    table = soup.find('table', id='srchResultsList')
    if table is None:
        raise RuntimeError(
            f"Could not find the 'srchResultsList' table in {in_path}. "
            'Make sure you switched to "View in Search Results Format" and saved the page as HTML.')

    processed_data = []
    for row in table.find_all('tr', id=cond):
        cells = [elem.text.strip() for elem in row.find_all('td')]
        if len(cells) < 4:
            # the institution is read from the fourth cell, so shorter rows are unusable
            print(f"Skipping a result row with only {len(cells)} cell(s); expected at least 4.")
            continue
        # the first line of the first cell holds "Lastname, Firstname"
        lastname, firstname = split_author_name(cells[0].split("\n")[0])
        processed_data.append([lastname, firstname, cells[3]])

    df = DataFrame(processed_data,
                   columns=["lastname", "firstname", "institution"])

    if find_years:
        print(
            "Finding Year of last collaboration - this might take up to a few seconds per co-author, "
            "please be patient if Scopus is being slow ... (make a coffee, stretch out, etc...)")

        def last_joint_year(lastname, firstname):
            """Return the latest cover year of joint publications with this author, or None."""
            # search by last name plus first initial; fall back to last name only
            # when the first name is missing
            name = f"{lastname}, {firstname[0]}" if firstname else lastname
            search = ScopusSearch(f"AU-ID({scopus_id}) AND AUTHOR-NAME({name})")
            # `results` may be None (not an empty list) when nothing matches
            return latest_year(doc.coverDate for doc in (search.results or ()))

        years = []
        for lastname, firstname in zip(df["lastname"], df["firstname"]):
            # search for joint publications between yourself and each colleague
            year = last_joint_year(lastname, firstname)

            # unfortunately, colleagues with a space in their last name ("Ping Ong") do not always search properly ...
            # try to get a better result for them by searching just the last part of their name ("Ong")
            if ' ' in lastname:
                try:
                    alt_year = last_joint_year(lastname.split(' ')[-1], firstname)
                except Exception as err:  # best effort only: report and keep the first result
                    print(f"Fallback search for '{lastname}' failed: {err}")
                else:
                    year = latest_year([year, alt_year])

            if year is None:
                print(f"No joint publication found for {lastname}, {firstname}; leaving year blank.")
            years.append(year)

        df["year"] = years

    # rows without a year (None) are placed last by sort_values
    df.sort_values(by=[sort_col], inplace=True)
    df.to_excel(out_path, index=False)
	"""
	OVERVIEW: This script will automatically export a list of all your co-authors and
	their institutions to an Excel file based on information already in the Scopus database.

	LIMITATIONS:
	1. Only up to 150 collaborators are supported by Scopus.
	2. Sometimes, you want to filter by collaborators for only the last 4 years. Unfortunately, there
	is no simple way to do this up front, but by enabling the years option you can filter at the end.

	INSTRUCTIONS (these are actually important!):

	0. Make sure your institution has a Scopus subscription. Also make sure you have the "pandas", "bs4" (BeautifulSoup), and "pybliometrics" Python libraries installed; you may also need supplemental libraries such as lxml (HTML parsing) and openpyxl (Excel export).
	1. Find your author profile on scopus.com, e.g. https://www.scopus.com/authid/detail.uri?authorId=7404463800
	2. There should be some text above some other results that states how many co-authors you have, (i) click that text AND (ii) also click "View in Search Results Format"
	3. Click "exclude" to exclude institutions from the proposal (e.g., your home institution) or other exclusionary factors. Use the sidebar to click some attributes then click "Exclude".
	--->Remember you are limited to 150 results. So excluding results might be important!! If you cannot exclude enough authors in one pass, repeat this process twice (with different exclusion filters) so that each time you have only 150 authors max.
	4. Make sure you display 200 results per page so you get everything on this page.
	5. Export the HTML page as a file called "coauthors.html" and move it to your Desktop. You need to use your browser's "Save As" function to save the page as HTML.
	6. In this script, update the variables "in_path", "out_path", "find_years" (set it to True if you want to export year information), and "scopus_id" (required if you want to export year information).
	7. RUN!! You should see a file called coauthors.xlsx appear in the Desktop.
	8. Double check the results, don't just trust the computer to do the right thing ...

	"""
	from pybliometrics.scopus import ScopusSearch


	def cond(x):
	    """Return True when *x* is an element id that marks a data-carrying table row.

	    A falsy id (``None`` or ``""``) gives ``False``; any other value is
	    accepted only if it starts with ``"resultDataRow"``.
	    """
	    # NOTE(review): this repeats the `cond` defined earlier in this file; the
	    # body indentation had been lost here, so the function could not parse.
	    if not x:
	        return False
	    return x.startswith("resultDataRow")


	# NOTE(review): this section repeats the script body that already appears
	# earlier in this file; its indentation had been flattened so it could not
	# parse. One of the two copies should be removed, otherwise the export runs twice.
	def split_author_name(author):
	    """Split a ``"Lastname, Firstname"`` string into ``(lastname, firstname)``.

	    The split happens at the first comma and both parts are stripped of
	    surrounding whitespace. When *author* contains no comma, ``firstname`` is
	    an empty string instead of an ``IndexError`` being raised.
	    """
	    lastname, _, firstname = author.partition(",")
	    return lastname.strip(), firstname.strip()


	def latest_year(cover_dates):
	    """Return the most recent four-digit year found in *cover_dates*.

	    *cover_dates* is an iterable of date strings that begin with the year
	    (e.g. ``"2021-01-15"`` or just ``"2021"``). ``None``/empty entries are
	    skipped, and ``None`` in place of the iterable is treated as empty.
	    Returns ``None`` when there is no usable entry.
	    """
	    years = [date[:4] for date in (cover_dates or ()) if date]
	    # four-digit year strings order the same way lexicographically and numerically
	    return max(years) if years else None


	if __name__ == "__main__":
	    import os

	    import bs4 as bs
	    from pandas import DataFrame

	    in_path = os.path.expanduser(
	        "~/Desktop/coauthors.html")  # location containing the input HTML file from Scopus
	    out_path = os.path.expanduser(
	        "~/Desktop/coauthors.xlsx")  # location to place the output Excel file

	    find_years = True  # whether to get the last year of joint publication; this can take time so only enable if needed
	    scopus_id = 7404463800  # if you enable find_years, you MUST enter your scopus id.

	    if find_years and not scopus_id:
	        raise RuntimeError(
	            "Must enter scopus_id if you enable find_years! If you see this message, update the script with your scopus_id and read the instructions!!")

	    sort_col = "year" if find_years else "institution"  # sort the data at the end by this column

	    # Read raw bytes and let BeautifulSoup detect the page encoding, rather than
	    # decoding with the platform default (which can garble accented author names).
	    with open(in_path, "rb") as f:
	        source = f.read()

	    soup = bs.BeautifulSoup(source, features="lxml")

	    table = soup.find('table', id='srchResultsList')
	    if table is None:
	        raise RuntimeError(
	            f"Could not find the 'srchResultsList' table in {in_path}. "
	            'Make sure you switched to "View in Search Results Format" and saved the page as HTML.')

	    processed_data = []
	    for row in table.find_all('tr', id=cond):
	        cells = [elem.text.strip() for elem in row.find_all('td')]
	        if len(cells) < 4:
	            # the institution is read from the fourth cell, so shorter rows are unusable
	            print(f"Skipping a result row with only {len(cells)} cell(s); expected at least 4.")
	            continue
	        # the first line of the first cell holds "Lastname, Firstname"
	        lastname, firstname = split_author_name(cells[0].split("\n")[0])
	        processed_data.append([lastname, firstname, cells[3]])

	    df = DataFrame(processed_data,
	                   columns=["lastname", "firstname", "institution"])

	    if find_years:
	        print(
	            "Finding Year of last collaboration - this might take up to a few seconds per co-author, "
	            "please be patient if Scopus is being slow ... (make a coffee, stretch out, etc...)")

	        def last_joint_year(lastname, firstname):
	            """Return the latest cover year of joint publications with this author, or None."""
	            # search by last name plus first initial; fall back to last name only
	            # when the first name is missing
	            name = f"{lastname}, {firstname[0]}" if firstname else lastname
	            search = ScopusSearch(f"AU-ID({scopus_id}) AND AUTHOR-NAME({name})")
	            # `results` may be None (not an empty list) when nothing matches
	            return latest_year(doc.coverDate for doc in (search.results or ()))

	        years = []
	        for lastname, firstname in zip(df["lastname"], df["firstname"]):
	            # search for joint publications between yourself and each colleague
	            year = last_joint_year(lastname, firstname)

	            # unfortunately, colleagues with a space in their last name ("Ping Ong") do not always search properly ...
	            # try to get a better result for them by searching just the last part of their name ("Ong")
	            if ' ' in lastname:
	                try:
	                    alt_year = last_joint_year(lastname.split(' ')[-1], firstname)
	                except Exception as err:  # best effort only: report and keep the first result
	                    print(f"Fallback search for '{lastname}' failed: {err}")
	                else:
	                    year = latest_year([year, alt_year])

	            if year is None:
	                print(f"No joint publication found for {lastname}, {firstname}; leaving year blank.")
	            years.append(year)

	        df["year"] = years

	    # rows without a year (None) are placed last by sort_values
	    df.sort_values(by=[sort_col], inplace=True)
	    df.to_excel(out_path, index=False)