Instantly share code, notes, and snippets.
Last active
July 17, 2019 00:27
-
Star
(0)
0
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save stephensekula/d06c7f5d0e33433bf980af00d1f41291 to your computer and use it in GitHub Desktop.
A script for retrieving "big science" data from INSPIREHEP.net. Created to harvest data for the SMU-in-Taos Cultural Institute, 2019, for the course "The Secret City: Los Alamos and the Atomic Age"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# | |
# Name: HEPAuthorStudy | |
# Description: A script for retrieving "big science" data from INSPIREHEP.net. | |
# Created to harvest data for the SMU-in-Taos Cultural Institute, | |
# 2019, for the course "The Secret City: Los Alamos and the | |
# Atomic Age". The idea was to study the number of accelerator- | |
# based experiments, and the size of author lists from those | |
# experiments, as a function of time from before WWII to well | |
# after (present day) | |
# Copyright (C) 2019 Stephen Jacob Sekula. | |
# | |
# | |
"""A script to use the INSPIRE API to study collaboration size.""" | |
import urllib3 | |
#import request, urlopen, URLError | |
from bs4 import BeautifulSoup, Comment | |
import re | |
import os | |
import sys | |
import time | |
import math | |
import json | |
import pandas as pd | |
import numpy as np | |
import matplotlib | |
matplotlib.use('Agg') | |
import matplotlib.pyplot as plt | |
# Module-level accumulator: get_records() concatenates one summary row per
# year onto this DataFrame (rebinding the name through its own ``global``
# statement).  A ``global`` statement at module scope has no effect, so none
# is used here.
main_data = pd.DataFrame(
    columns=['year', 'nexperimental', 'max_authors', 'max_title', 'max_experiment'])
def search_inspire(url):
    """Do an API search at INSPIRE.

    Args:
        url: Fully formed search URL to fetch with an HTTP GET.

    Returns:
        The raw response body (``response.data``).

    Raises:
        SystemExit: If urllib3 raises a ``RequestError`` while performing the
            request.  The URL and the error are printed first and the
            process exits with a non-zero status.
    """
    http = urllib3.PoolManager()
    try:
        response = http.request('GET', url)
    except urllib3.exceptions.RequestError as error:
        print('URL =', url)
        print('No result. Got an error code:', error)
        # quit() is a convenience injected by the ``site`` module for
        # interactive sessions and is not guaranteed to exist; it also exits
        # with status 0.  sys.exit(1) is always available and reports failure.
        sys.exit(1)
    return response.data
def _is_experimental(record):
    """Return True for a published record that is tied to an experiment."""
    if record.get('publication_info') is None:
        return False
    return (record.get('experiment') is not None
            or record.get('accelerator_experiment') is not None)


def _record_title(record):
    """Return the record's title text, or the string "None" if unavailable.

    The ``title`` field is accepted either as a dict or as a list of dicts
    (in which case the first entry is used).
    """
    title = record.get("title")
    if isinstance(title, list):
        title = title[0] if title else None
    if isinstance(title, dict):
        return title.get("title", "None")
    return "None"


def _record_experiment(record):
    """Return a label naming the experiment behind a record.

    Preference order: the ``experiment`` field, then the comma-joined
    ``experiment`` names found under ``accelerator_experiment`` (a dict or a
    list of dicts), then the first author's full name, then "None".
    """
    experiment = record.get("experiment")
    if experiment is not None:
        return experiment

    accelerator = record.get("accelerator_experiment")
    if isinstance(accelerator, dict):
        accelerator = [accelerator]
    if isinstance(accelerator, list):
        names = [str(entry["experiment"]) for entry in accelerator
                 if isinstance(entry, dict) and entry.get("experiment")]
        if names:
            return ",".join(names)

    authors = record.get("authors")
    if authors:
        return authors[0]["full_name"]
    return "None"


def get_records(year='1974', threshold='1'):
    """
    Year-by-year query. Save data in file on disk named by year. Load
    that file if it already exists.

    INSPIREHEP's API returns up to 250 records at a time. If a query yields
    more, you have to query again up through the total number of groups of
    250 records. This function first asks for the total number of records
    (read from the first comment of the XML reply) and then fetches the
    records in batches of 250.

    Only records that have publication info AND an ``experiment`` or
    ``accelerator_experiment`` entry are counted.  Among those, the record
    with the most authors supplies ``max_authors``, ``max_title`` and
    ``max_experiment``.

    Args:
        year: Year to query; also names the cache file ``<year>.h5``.
        threshold: Minimum author count, inserted into the ``ac`` search term.

    Returns:
        None.  A one-row summary is appended to the module-level
        ``main_data`` DataFrame and, when freshly computed, written to the
        cache file under the key ``main_data``.
    """
    global main_data

    output_file = f'{year}.h5'
    if os.path.isfile(output_file):
        # The context manager closes the store even if the read fails.
        with pd.HDFStore(output_file) as store:
            cached = store['main_data']
        main_data = pd.concat([main_data, cached])
        return

    page_size = 250
    query = f'find+date+=+{year}+and+ac+{threshold}%2b'

    # First request: a single-record XML reply whose leading comment carries
    # the total number of matches.
    url = f'http://inspirehep.net/search?of=xm&rg=1&ot=001&p={query}'
    result = BeautifulSoup(search_inspire(url), "lxml")
    comments = result.findAll(text=lambda text: isinstance(text, Comment))
    number_of_records = int(re.sub(r'\D', '', comments[0]))
    print(f" number of records: {number_of_records}")

    results_groups = math.ceil(number_of_records / page_size)
    all_results = []
    for group in range(results_groups):
        print(f" processing group {group}/{results_groups}")
        jrec = page_size * group + 1  # jrec is the 1-based index of the first record
        url = (f'http://inspirehep.net/search?of=recjson&rg={page_size}&jrec={jrec}'
               f'&p={query}'
               '&ot=recid,accelerator_experiment,experiment,publication_info,'
               'number_of_authors,authors[0],title')
        # extend() avoids re-copying the accumulated list for every page.
        all_results.extend(json.loads(search_inspire(url)))

    count_experimental = 0
    max_authors = 0
    max_experiment = "None"
    max_title = "None"
    print(f" --> number of records available for analysis: {len(all_results)}")
    for record in all_results:
        # The filter compares against None itself; comparing against the
        # string "None" is always true and would count every published record.
        if not _is_experimental(record):
            continue
        count_experimental += 1

        n_authors = record.get("number_of_authors") or 0
        if n_authors > max_authors:
            max_authors = n_authors
            # Both labels are recomputed from this record alone so nothing
            # from a previous maximum leaks into them.
            max_title = _record_title(record)
            max_experiment = _record_experiment(record)

    this_data = pd.DataFrame.from_dict({
        'year': [year],
        'nexperimental': [count_experimental],
        'max_authors': [max_authors],
        'max_title': [max_title],
        'max_experiment': [max_experiment],
    })
    with pd.HDFStore(output_file) as store:
        store['main_data'] = this_data
    main_data = pd.concat([main_data, this_data])
def timeplot(x, y, xlabel, ylabel, xmin, xmax, xsteps, ymin, ymax, plotname="plot.pdf"):
    """Plot ``y`` against ``x`` as a thick line and save it to ``plotname``.

    Args:
        x, y: Data for the horizontal and vertical axes.
        xlabel, ylabel: Axis labels.
        xmin, xmax: Horizontal axis limits; ticks are placed from ``xmin``
            up to (not including) ``xmax`` every ``xsteps``.
        xsteps: Spacing between horizontal ticks.
        ymin, ymax: Vertical axis limits.
        plotname: Output file name handed to ``savefig``.
    """
    figure, axis = plt.subplots()
    axis.plot(x, y, linewidth=7.0)

    axis.set_xlabel(xlabel)
    axis.set_ylabel(ylabel)

    axis.set_xlim(xmin, xmax)
    axis.set_ylim(ymin, ymax)
    tick_positions = np.arange(xmin, xmax, xsteps)
    axis.set_xticks(tick_positions)
    axis.grid()

    figure.tight_layout()
    figure.savefig(plotname)
    plt.close(figure)
if __name__ == '__main__':
    # Usage: script.py [START_YEAR STOP_YEAR]
    #
    # With two integer years inside 1935..2050, only those years are
    # harvested and the combined table is printed.  Otherwise the full
    # 1935-2019 span is harvested and the plots are produced.

    # Axis range for the full-span plots when no usable range is supplied.
    # Initialising these to the real span keeps both the "default range"
    # message and the plot limits meaningful (0-0 gives an empty x-axis).
    DEFAULT_START = 1935
    DEFAULT_STOP = 2019
    range_requested = False

    try:
        first_year, last_year = int(sys.argv[1]), int(sys.argv[2])
    except (IndexError, ValueError):
        # Missing or non-numeric arguments: fall back to the defaults.
        print(f"Using default range: {DEFAULT_START}-{DEFAULT_STOP}")
    else:
        DEFAULT_START, DEFAULT_STOP = first_year, last_year
        range_requested = True

    if range_requested and DEFAULT_START >= 1935 and DEFAULT_STOP <= 2050:
        for year in range(DEFAULT_START, DEFAULT_STOP + 1):
            print(f"Retrieving data for year {year}")
            get_records(str(year))
        # fix the indices from stitching together so many DataFrames
        main_data.index = np.arange(0, main_data["year"].count())
        print(main_data)
    else:
        # Only do plotting if specific years have not been passed
        for year in range(1935, 2020):
            print(f"Retrieving data for year {year}")
            get_records(str(year))

        # Make some graphics!
        plot_years = pd.to_numeric(main_data.year, downcast='integer')
        author_counts = pd.to_numeric(main_data.max_authors, downcast='integer')
        result_counts = pd.to_numeric(main_data.nexperimental, downcast='integer')

        timeplot(x=plot_years,
                 y=author_counts,
                 xlabel="Calendar Year",
                 ylabel="Maximum Number of Authors",
                 xmin=DEFAULT_START,
                 xmax=DEFAULT_STOP,
                 xsteps=10,
                 ymin=0,
                 ymax=6000,
                 plotname="maxauthors_vs_year.pdf")
        timeplot(x=plot_years,
                 y=author_counts,
                 xlabel="Calendar Year",
                 ylabel="Maximum Number of Authors",
                 xmin=1935,
                 xmax=1965,
                 xsteps=5,
                 ymin=0,
                 ymax=60,
                 plotname="maxauthors_vs_year_1935_1965.pdf")
        timeplot(x=plot_years,
                 y=result_counts,
                 xlabel="Calendar Year",
                 ylabel="Total Published Experimental Results",
                 xmin=DEFAULT_START,
                 xmax=DEFAULT_STOP,
                 xsteps=10,
                 ymin=0,
                 ymax=60000,
                 plotname="nexperimental_vs_year.pdf")
        timeplot(x=plot_years,
                 y=result_counts,
                 xlabel="Calendar Year",
                 ylabel="Total Published Experimental Results",
                 xmin=1935,
                 xmax=1965,
                 xsteps=10,
                 ymin=0,
                 ymax=2000,
                 plotname="nexperimental_vs_year_1935_1965.pdf")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment