A script for retrieving "big science" data from INSPIREHEP.net. Created to harvest data for the SMU-in-Taos Cultural Institute, 2019, for the course "The Secret City: Los Alamos and the Atomic Age"
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Name: HEPAuthorStudy
# Description: A script for retrieving "big science" data from INSPIREHEP.net.
# Created to harvest data for the SMU-in-Taos Cultural Institute,
# 2019, for the course "The Secret City: Los Alamos and the
# Atomic Age". The idea was to study the number of accelerator-
# based experiments, and the size of author lists from those
# experiments, as a function of time from before WWII to well
# after (present day)
# Copyright (C) 2019 Stephen Jacob Sekula.
#
#
"""A script to use the INSPIRE API to study collaboration size."""
import urllib3
#import request, urlopen, URLError
from bs4 import BeautifulSoup, Comment
import re
import os
import sys
import time
import math
import json
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Master DataFrame: one row per year, holding the number of experimental
# publications and the largest author list (plus its title and experiment)
# found that year.
global main_data
main_data = pd.DataFrame(columns=['year', 'nexperimental', 'max_authors',
                                  'max_title', 'max_experiment'])


def search_inspire(url):
    """Do an API search at INSPIRE."""
    http = urllib3.PoolManager()
    try:
        response = http.request('GET', url)
        return response.data
    except urllib3.exceptions.RequestError as error:
        print('URL =', url)
        print('No result. Got an error code:', error)
        quit()
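
# search_inspire() takes a fully formed INSPIRE legacy search URL, like the
# ones assembled in get_records() below, and returns the raw response body as
# bytes. A hypothetical call (the year 1950 here is only an example):
#
#   xml_bytes = search_inspire(
#       'http://inspirehep.net/search?of=xm&rg=1&ot=001&p=find+date+=+1950+and+ac+1%2b')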


def get_records(year='1974', threshold='1'):
    """
    Year-by-year query. Save the data in a file on disk named by year, and
    load that file instead if it already exists.

    INSPIREHEP's API returns at most 250 records per query. If a search
    matches more, you have to query repeatedly, stepping the starting
    record (jrec) forward by 250 each time. This function first asks
    INSPIRE how many records match, then pages through them in groups of
    250. Results are stored in the main_data DataFrame. The results are
    restricted so that only accelerator-based experiments are considered,
    and only public records in journals are counted.
    """
    global main_data

    output_file = f'{year}.h5'

    if os.path.isfile(output_file):
        store = pd.HDFStore(output_file)
        main_data = pd.concat([main_data, store['main_data']])
        store.close()
    else:
        url = f'http://inspirehep.net/search?of=xm&rg=1&ot=001&p=find+date+=+{year}+and+ac+{threshold}%2b'
        result = BeautifulSoup(search_inspire(url), "lxml")
        comments = result.findAll(text=lambda text: isinstance(text, Comment))
        number_of_records = int(re.sub(r'\D', '', comments[0]))
        print(f" number of records: {number_of_records}")

        results_groups = math.ceil(number_of_records/250)

        all_results = []
        for group in np.arange(results_groups):
            print(f" processing group {group}/{results_groups}")
            jrec = 250*group + 1
            url = f'http://inspirehep.net/search?of=recjson&rg=250&jrec={jrec}&p=find+date+=+{year}+and+ac+{threshold}%2b&ot=recid,accelerator_experiment,experiment,publication_info,number_of_authors,authors[0],title'
            result = json.loads(search_inspire(url))
            all_results = all_results + result

        # count experimental results only
        count_experimental = 0
        # get max number of authors in year
        max_authors = 0
        max_experiment = "None"
        max_title = "None"

        print(f" --> number of records available for analysis: {len(all_results)}")
        for record in all_results:
            if record['publication_info'] is not None and (record['experiment'] != "None" or record['accelerator_experiment'] is not None):
                count_experimental += 1
                if max_authors < record["number_of_authors"]:
                    max_authors = record["number_of_authors"]
                    # remember the title and experiment of the record with
                    # the largest author list seen so far
                    if isinstance(record["title"], list):
                        max_title = record["title"][0]["title"]
                    if isinstance(record["title"], dict):
                        max_title = record["title"]["title"]
                    if record['experiment'] is not None:
                        max_experiment = record["experiment"]
                    elif record['accelerator_experiment'] is not None:
                        if isinstance(record["accelerator_experiment"], list):
                            max_experiment = ""
                            for experiment in record["accelerator_experiment"]:
                                experiment_name = "None"
                                if 'experiment' in experiment:
                                    experiment_name = experiment["experiment"]
                                max_experiment += experiment_name + ","
                        elif isinstance(record["accelerator_experiment"], dict):
                            experiment_name = "None"
                            if "experiment" in record["accelerator_experiment"]:
                                experiment_name = record["accelerator_experiment"]["experiment"]
                            max_experiment = experiment_name
                    if max_experiment == "None":
                        # no experiment name available; fall back to the first author
                        max_experiment = record["authors"][0]["full_name"]

        #print(count_experimental)
        #print(max_authors)
        this_data = pd.DataFrame.from_dict({'year': [year],
                                            'nexperimental': [count_experimental],
                                            'max_authors': [max_authors],
                                            'max_title': [max_title],
                                            'max_experiment': [max_experiment]})

        store = pd.HDFStore(output_file)
        store['main_data'] = this_data
        store.close()

        main_data = pd.concat([main_data, this_data])


def timeplot(x, y, xlabel, ylabel, xmin, xmax, xsteps, ymin, ymax, plotname="plot.pdf"):
    # Generate representations of the data
    fig, axes = plt.subplots()
    plt.plot(x, y, linewidth=7.0)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    axes.set_xlim(xmin, xmax)
    axes.set_ylim(ymin, ymax)
    plt.xticks(np.arange(xmin, xmax, xsteps))
    axes.grid()
    plt.tight_layout()
    plt.savefig(plotname)
    plt.close()


if __name__ == '__main__':
    DEFAULT_START = 0
    DEFAULT_STOP = 0
    years = np.arange(DEFAULT_START, DEFAULT_STOP+1, 1)
    try:
        years = np.arange(int(sys.argv[1]), int(sys.argv[2])+1, 1)
        DEFAULT_START = int(sys.argv[1])
        DEFAULT_STOP = int(sys.argv[2])
    except (IndexError, ValueError):
        print(f"Using default range: {DEFAULT_START}-{DEFAULT_STOP}")

    if DEFAULT_START >= 1935 and DEFAULT_STOP <= 2050:
        # A specific year range was passed: harvest it and print the table.
        for year in years:
            print(f"Retrieving data for year {year}")
            get_records(str(year))
        # fix the indices from stitching together so many DataFrames
        main_data_indices = np.arange(0, main_data["year"].count())
        main_data.index = main_data_indices
        print(main_data)
    else:
        # Only do plotting if specific years have not been passed:
        # harvest the full 1935-2019 range and plot it.
        DEFAULT_START = 1935
        DEFAULT_STOP = 2019
        for year in np.arange(1935, 2020, 1):
            print(f"Retrieving data for year {year}")
            get_records(str(year))
        # Make some graphics!
        timeplot(x=pd.to_numeric(main_data.year, downcast='integer'),
                 y=pd.to_numeric(main_data.max_authors, downcast='integer'),
                 xlabel="Calendar Year",
                 ylabel="Maximum Number of Authors",
                 xmin=DEFAULT_START,
                 xmax=DEFAULT_STOP,
                 xsteps=10,
                 ymin=0,
                 ymax=6000,
                 plotname="maxauthors_vs_year.pdf")
        timeplot(x=pd.to_numeric(main_data.year, downcast='integer'),
                 y=pd.to_numeric(main_data.max_authors, downcast='integer'),
                 xlabel="Calendar Year",
                 ylabel="Maximum Number of Authors",
                 xmin=1935,
                 xmax=1965,
                 xsteps=5,
                 ymin=0,
                 ymax=60,
                 plotname="maxauthors_vs_year_1935_1965.pdf")
        timeplot(x=pd.to_numeric(main_data.year, downcast='integer'),
                 y=pd.to_numeric(main_data.nexperimental, downcast='integer'),
                 xlabel="Calendar Year",
                 ylabel="Total Published Experimental Results",
                 xmin=DEFAULT_START,
                 xmax=DEFAULT_STOP,
                 xsteps=10,
                 ymin=0,
                 ymax=60000,
                 plotname="nexperimental_vs_year.pdf")
        timeplot(x=pd.to_numeric(main_data.year, downcast='integer'),
                 y=pd.to_numeric(main_data.nexperimental, downcast='integer'),
                 xlabel="Calendar Year",
                 ylabel="Total Published Experimental Results",
                 xmin=1935,
                 xmax=1965,
                 xsteps=10,
                 ymin=0,
                 ymax=2000,
                 plotname="nexperimental_vs_year_1935_1965.pdf")