@computron
Last active January 15, 2020 00:04
A script to automatically generate a list of co-authors / collaborators using the Scopus database
"""
OVERVIEW: This script will automatically export a list of all your co-authors and
their institutions to an Excel file based on information already in the Scopus database.
LIMITATIONS:
1. Only up to 150 collaborators are supported by Scopus.
2. Sometimes you only want collaborators from, say, the last 4 years; unfortunately,
there is no simple way to filter for that in Scopus.
INSTRUCTIONS:
0. Make sure you have the "pandas", "bs4" (BeautifulSoup), "lxml", and "openpyxl" libraries installed (see the install note after this docstring)
1. Find your author profile on scopus.com, e.g. https://www.scopus.com/authid/detail.uri?authorId=7404463800
2. Above the search results there should be text stating how many co-authors you have; (i) click that text AND (ii) also click "View in Search Results Format"
3. Optionally exclude results that don't belong in the list (e.g., co-authors at your home institution, or anything else your proposal doesn't require): use the sidebar to select the relevant attributes, then click "Exclude".
--->Remember you are limited to 150 results, so excluding some results might be important!!
4. Make sure you display 200 results per page so that all results appear on a single page
5. Export the HTML page as a file called "coauthors.html" and move it to your Desktop.
6. RUN this script! You should see a file called coauthors.xlsx appear on your Desktop.
"""
def cond(x):
    # condition for a table row actually containing data
    return bool(x) and x.startswith("resultDataRow")
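
# Illustrative behavior of cond() with made-up row ids:
#   cond("resultDataRow15")       -> True   (a data row)
#   cond("documentResultsHeader") -> False  (not a data row)
#   cond(None)                    -> False  (rows without an id attribute)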
if __name__ == "__main__":
    import os

    import bs4 as bs
    from pandas import DataFrame

    in_path = os.path.expanduser("~/Desktop/coauthors.html")  # location containing the input HTML file from Scopus
    out_path = os.path.expanduser("~/Desktop/coauthors.xlsx")  # location to place the output Excel file
    sort_col = "institution"  # sort the data at the end by this column

    with open(in_path) as f:
        source = f.read()

    soup = bs.BeautifulSoup(source, features="lxml")

    raw_data = []
    table = soup.find('table', id='srchResultsList')
    rows = table.find_all('tr', id=cond)
    for row in rows:
        cols = row.find_all('td')
        cols = [elem.text.strip() for elem in cols]
        raw_data.append(cols)  # keep all columns; the relevant ones are picked out below

    processed_data = []
    for x in raw_data:
        author = x[0].split("\n")[0]  # first line of the first cell should be "Lastname, Firstname"
        lastname = author.split(",")[0].strip()
        firstname = author.split(",")[1].strip()
        institution = x[3]  # fourth cell holds the institution
        processed_data.append([lastname, firstname, institution])
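
    # At this point each processed_data entry has the form [lastname, firstname, institution],
    # e.g. (made-up values): ["Curie", "Marie", "Sorbonne University"]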
    df = DataFrame(processed_data, columns=["lastname", "firstname", "institution"])
    df.sort_values(by=[sort_col], inplace=True)
    df.to_excel(out_path, index=False)
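
# Usage sketch (assuming this file is saved as, e.g., scopus_coauthors.py and
# coauthors.html is already on your Desktop):
#
#   python scopus_coauthors.py
#
# The resulting coauthors.xlsx opens directly in Excel, or can be spot-checked with pandas:
#
#   import os
#   import pandas as pd
#   print(pd.read_excel(os.path.expanduser("~/Desktop/coauthors.xlsx")).head())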