A script for retrieving "big science" data from INSPIREHEP.net. Created to harvest data for the SMU-in-Taos Cultural Institute, 2019, for the course "The Secret City: Los Alamos and the Atomic Age"
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Name: HEPAuthorStudy
# Description: A script for retrieving "big science" data from INSPIREHEP.net.
# Created to harvest data for the SMU-in-Taos Cultural Institute,
# 2019, for the course "The Secret City: Los Alamos and the
# Atomic Age". The idea was to study the number of accelerator-
# based experiments, and the size of author lists from those
# experiments, as a function of time from before WWII to well
# after (present day)
# Copyright (C) 2019 Stephen Jacob Sekula.
#
#
"""A script to use the INSPIRE API to study collaboration size."""
import urllib3
#import request, urlopen, URLError
from bs4 import BeautifulSoup, Comment
import re
import os
import sys
import time
import math
import json
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Master DataFrame: one row per year, holding the number of experimental
# publications and the largest author list (plus its title and experiment)
# found that year.
global main_data
main_data = pd.DataFrame(columns=['year', 'nexperimental', 'max_authors',
                                  'max_title', 'max_experiment'])


def search_inspire(url):
    """Do an API search at INSPIRE."""
    http = urllib3.PoolManager()
    try:
        response = http.request('GET', url)
        return response.data
    except urllib3.exceptions.RequestError as error:
        print('URL =', url)
        print('No result. Got an error code:', error)
        quit()
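
# search_inspire() takes a fully formed INSPIRE legacy search URL, like the
# ones assembled in get_records() below, and returns the raw response body as
# bytes. A hypothetical call (the year 1950 here is only an example):
#
#   xml_bytes = search_inspire(
#       'http://inspirehep.net/search?of=xm&rg=1&ot=001&p=find+date+=+1950+and+ac+1%2b')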


def get_records(year='1974', threshold='1'):
    """
    Year-by-year query. Save the data in a file on disk named by year, and
    load that file instead if it already exists.

    INSPIREHEP's API returns at most 250 records per query. If a search
    matches more, you have to query repeatedly, stepping the starting
    record (jrec) forward by 250 each time. This function first asks
    INSPIRE how many records match, then pages through them in groups of
    250. Results are stored in the main_data DataFrame. The results are
    restricted so that only accelerator-based experiments are considered,
    and only public records in journals are counted.
    """
    global main_data

    output_file = f'{year}.h5'

    if os.path.isfile(output_file):
        store = pd.HDFStore(output_file)
        main_data = pd.concat([main_data, store['main_data']])
        store.close()
    else:
        url = f'http://inspirehep.net/search?of=xm&rg=1&ot=001&p=find+date+=+{year}+and+ac+{threshold}%2b'
        result = BeautifulSoup(search_inspire(url), "lxml")
        comments = result.findAll(text=lambda text: isinstance(text, Comment))
        number_of_records = int(re.sub(r'\D', '', comments[0]))
        print(f" number of records: {number_of_records}")

        results_groups = math.ceil(number_of_records/250)

        all_results = []
        for group in np.arange(results_groups):
            print(f" processing group {group}/{results_groups}")
            jrec = 250*group + 1
            url = f'http://inspirehep.net/search?of=recjson&rg=250&jrec={jrec}&p=find+date+=+{year}+and+ac+{threshold}%2b&ot=recid,accelerator_experiment,experiment,publication_info,number_of_authors,authors[0],title'
            result = json.loads(search_inspire(url))
            all_results = all_results + result

        # count experimental results only
        count_experimental = 0
        # get max number of authors in year
        max_authors = 0
        max_experiment = "None"
        max_title = "None"

        print(f" --> number of records available for analysis: {len(all_results)}")
        for record in all_results:
            if record['publication_info'] is not None and (record['experiment'] != "None" or record['accelerator_experiment'] is not None):
                count_experimental += 1
                if max_authors < record["number_of_authors"]:
                    max_authors = record["number_of_authors"]
                    # remember the title and experiment of the record with
                    # the largest author list seen so far
                    if isinstance(record["title"], list):
                        max_title = record["title"][0]["title"]
                    if isinstance(record["title"], dict):
                        max_title = record["title"]["title"]
                    if record['experiment'] is not None:
                        max_experiment = record["experiment"]
                    elif record['accelerator_experiment'] is not None:
                        if isinstance(record["accelerator_experiment"], list):
                            max_experiment = ""
                            for experiment in record["accelerator_experiment"]:
                                experiment_name = "None"
                                if 'experiment' in experiment:
                                    experiment_name = experiment["experiment"]
                                max_experiment += experiment_name + ","
                        elif isinstance(record["accelerator_experiment"], dict):
                            experiment_name = "None"
                            if "experiment" in record["accelerator_experiment"]:
                                experiment_name = record["accelerator_experiment"]["experiment"]
                            max_experiment = experiment_name
                    if max_experiment == "None":
                        # no experiment name available; fall back to the first author
                        max_experiment = record["authors"][0]["full_name"]

        #print(count_experimental)
        #print(max_authors)
        this_data = pd.DataFrame.from_dict({'year': [year],
                                            'nexperimental': [count_experimental],
                                            'max_authors': [max_authors],
                                            'max_title': [max_title],
                                            'max_experiment': [max_experiment]})

        store = pd.HDFStore(output_file)
        store['main_data'] = this_data
        store.close()

        main_data = pd.concat([main_data, this_data])


def timeplot(x, y, xlabel, ylabel, xmin, xmax, xsteps, ymin, ymax, plotname="plot.pdf"):
    # Generate representations of the data
    fig, axes = plt.subplots()
    plt.plot(x, y, linewidth=7.0)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    axes.set_xlim(xmin, xmax)
    axes.set_ylim(ymin, ymax)
    plt.xticks(np.arange(xmin, xmax, xsteps))
    axes.grid()
    plt.tight_layout()
    plt.savefig(plotname)
    plt.close()


if __name__ == '__main__':
    DEFAULT_START = 0
    DEFAULT_STOP = 0
    years = np.arange(DEFAULT_START, DEFAULT_STOP+1, 1)
    try:
        years = np.arange(int(sys.argv[1]), int(sys.argv[2])+1, 1)
        DEFAULT_START = int(sys.argv[1])
        DEFAULT_STOP = int(sys.argv[2])
    except (IndexError, ValueError):
        print(f"Using default range: {DEFAULT_START}-{DEFAULT_STOP}")

    if DEFAULT_START >= 1935 and DEFAULT_STOP <= 2050:
        # A specific year range was passed: harvest it and print the table.
        for year in years:
            print(f"Retrieving data for year {year}")
            get_records(str(year))
        # fix the indices from stitching together so many DataFrames
        main_data_indices = np.arange(0, main_data["year"].count())
        main_data.index = main_data_indices
        print(main_data)
    else:
        # Only do plotting if specific years have not been passed:
        # harvest the full 1935-2019 range and plot it.
        DEFAULT_START = 1935
        DEFAULT_STOP = 2019
        for year in np.arange(1935, 2020, 1):
            print(f"Retrieving data for year {year}")
            get_records(str(year))
        # Make some graphics!
        timeplot(x=pd.to_numeric(main_data.year, downcast='integer'),
                 y=pd.to_numeric(main_data.max_authors, downcast='integer'),
                 xlabel="Calendar Year",
                 ylabel="Maximum Number of Authors",
                 xmin=DEFAULT_START,
                 xmax=DEFAULT_STOP,
                 xsteps=10,
                 ymin=0,
                 ymax=6000,
                 plotname="maxauthors_vs_year.pdf")
        timeplot(x=pd.to_numeric(main_data.year, downcast='integer'),
                 y=pd.to_numeric(main_data.max_authors, downcast='integer'),
                 xlabel="Calendar Year",
                 ylabel="Maximum Number of Authors",
                 xmin=1935,
                 xmax=1965,
                 xsteps=5,
                 ymin=0,
                 ymax=60,
                 plotname="maxauthors_vs_year_1935_1965.pdf")
        timeplot(x=pd.to_numeric(main_data.year, downcast='integer'),
                 y=pd.to_numeric(main_data.nexperimental, downcast='integer'),
                 xlabel="Calendar Year",
                 ylabel="Total Published Experimental Results",
                 xmin=DEFAULT_START,
                 xmax=DEFAULT_STOP,
                 xsteps=10,
                 ymin=0,
                 ymax=60000,
                 plotname="nexperimental_vs_year.pdf")
        timeplot(x=pd.to_numeric(main_data.year, downcast='integer'),
                 y=pd.to_numeric(main_data.nexperimental, downcast='integer'),
                 xlabel="Calendar Year",
                 ylabel="Total Published Experimental Results",
                 xmin=1935,
                 xmax=1965,
                 xsteps=10,
                 ymin=0,
                 ymax=2000,
                 plotname="nexperimental_vs_year_1935_1965.pdf")