Skip to content

Instantly share code, notes, and snippets.

@nickrsan
Created October 18, 2018 04:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nickrsan/487fa512050f8eb66b7cccd6dc5624da to your computer and use it in GitHub Desktop.
Save nickrsan/487fa512050f8eb66b7cccd6dc5624da to your computer and use it in GitHub Desktop.
A script to pull papers from the CrossRef API (using package Habanero) and do frequency analysis on titles, authors, and institutions.
import os
import re
from collections import Counter
from csv import DictWriter, writer
from time import sleep

from habanero import Crossref # CrossRef API access
# Contact address handed to Crossref(mailto=...) in get_papers()
HABANERO_USERNAME = "" # provide an email address so they can contact you if your script misbehaves
ISSN = "" # ISSN of the journal to dump data for
# Every output CSV is written into the folder containing this script and is named after the ISSN
BASE_FOLDER = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(BASE_FOLDER, "{}.csv".format(ISSN)) # the kept paper records, one row per paper, columns are KEYS_TO_KEEP
TITLE_FREQUENCY_FILE = os.path.join(BASE_FOLDER, "{}_title_frequency.csv".format(ISSN)) # counts of the words found in paper titles
AUTHOR_FREQUENCY_FILE = os.path.join(BASE_FOLDER, "{}_author_frequency.csv".format(ISSN)) # counts of author names
INSTITUTION_FREQUENCY_FILE = os.path.join(BASE_FOLDER, "{}_insitution_frequency.csv".format(ISSN)) # counts of author affiliations. NOTE(review): "insitution" in the file name is a typo; left as-is so the output path does not change
SLEEP_TIME = 0.05 # sleep for 50 ms between requests to be nice to the CrossRef API. NOTE(review): nothing in the code shown here reads this value; get_paper_info() calls sleep(1) directly - confirm which delay is intended
PER_PAGE = 1000 # records requested per call (passed as `limit` in get_papers()) and the offset step used by get_paper_info()
# Record fields copied by make_data_safe(); also used as the column names of OUTPUT_FILE
KEYS_TO_KEEP = [u'DOI', u'reference', u'issued', u'prefix', u'relation', u'author', u'reference-count', u'ISSN', u'member', u'source', u'score', u'deposited', u'indexed', u'type', u'published-online', u'URL', u'is-referenced-by-count', u'volume', u'issn-type', u'link', u'published-print', u'journal-issue', u'references-count', u'short-container-title', u'publisher', u'content-domain', u'language', u'license', u'created', u'issue', u'title', u'alternative-id', u'container-title', u'page']
def make_data_safe(paper, keys=KEYS_TO_KEEP):
    """
    Builds a trimmed copy of a paper record.

    Only the entries of `paper` whose key appears in `keys` are carried over; keys that the
    record does not have are skipped, and the values themselves are passed through untouched
    (no string conversion), so later analysis can still work with the original structures.

    :param paper: dict describing a single paper
    :param keys: iterable of the keys to retain, defaults to KEYS_TO_KEEP
    :return: new dict containing just the retained entries
    """
    return {field: paper[field] for field in keys if field in paper}
def _combine_dict_to_list(frequencies):
    """
    Turns a {item: count} mapping into a list of [item, count] rows.

    Each row is itself a list so the result can be handed straight to a csv writer's writerows().

    :param frequencies: dict mapping an item to its frequency
    :return: list of two-element lists, in the mapping's iteration order
    """
    return [[entry, count] for entry, count in frequencies.items()]
def frequency_titles(papers):
    """
    Counts how often each word occurs in the first title of every paper.

    Words are matched with the regex ``\\w+``, lower-cased, and only counted when they are longer
    than two characters. Titles are processed as text rather than UTF-8 encoded bytes: matching a
    str pattern against bytes raises TypeError on Python 3, and on byte strings non-ASCII words
    would be cut apart at every multi-byte character.

    :param papers: iterable of paper dicts; the title is read from ``paper['title'][0]``.
        Papers with a missing or empty title list are skipped instead of raising.
    :return: list of [word, count] lists, in order of first appearance
    """
    print("Getting frequency of words in titles")
    word_pattern = re.compile(r"\w+", re.UNICODE)  # compiled once, reused for every title
    word_counts = Counter()
    for paper in papers:
        titles = paper.get('title')
        if not titles:  # no 'title' key or an empty list - nothing to count
            continue
        for word in word_pattern.findall(titles[0]):
            match_word = word.lower()
            if len(match_word) > 2:  # drop one and two letter words ("a", "of", "to", ...)
                word_counts[match_word] += 1
    # list of lists so the result can be written out with csv writerows()
    return [[word, count] for word, count in word_counts.items()]
def frequency_institutions(papers):
    """
    Counts how often each author affiliation occurs across all papers.

    Every affiliation name is lower-cased and split on commas. When one of the comma separated
    parts contains the word "university", that part (the last one, if several match) is used as
    the key, so departments, schools and centers of the same university are counted together;
    otherwise the whole lower-cased name is the key. Surrounding whitespace is stripped from the
    key so the same name is not counted separately because of stray spaces.

    Names are handled as text rather than UTF-8 encoded bytes, because mixing bytes with the str
    arguments to split()/startswith() raises TypeError on Python 3.

    :param papers: iterable of paper dicts. Papers without an "author" key, authors without an
        "affiliation" list and affiliations without a "name" are skipped.
    :return: list of [institution, count] lists, in order of first appearance
    """
    print("Getting frequency of institutions")
    institution_counts = Counter()
    for paper in papers:
        if "author" not in paper:
            continue
        for author in paper["author"]:
            for affiliation in author.get("affiliation", ()):
                name = affiliation.get('name')
                if not name:
                    continue
                institution = name.lower()
                # try to figure out what their actual university is, not their institute,
                # school, center, department, etc
                for part in institution.split(","):
                    if "university" in part:
                        institution = part
                institution_counts[institution.strip()] += 1
    # list of lists so the result can be written out with csv writerows()
    return [[institution, count] for institution, count in institution_counts.items()]
def frequency_authors(papers):
    """
    Counts how often each author occurs across all papers.

    The key for an author is the "given" name followed by the "family" name (whichever of the two
    are present) with all spaces removed; an author with neither is counted under the empty
    string. Authors of papers whose first title starts with "book review" (case-insensitive) get
    the suffix "_book_review" so reviewers can be told apart from authors of regular papers.

    Names and titles are handled as text rather than UTF-8 encoded bytes, because mixing bytes
    with the str arguments to replace()/startswith() raises TypeError on Python 3.

    :param papers: iterable of paper dicts. Papers without an "author" key are skipped; a missing
        or empty "title" list simply means the paper is not treated as a book review.
    :return: list of [author, count] lists, in order of first appearance
    """
    print("Getting frequency of authors")
    author_counts = Counter()
    for paper in papers:
        if "author" not in paper:
            continue
        # the same for every author of this paper, so work it out once per paper
        titles = paper.get('title')
        is_book_review = bool(titles) and titles[0].lower().startswith("book review")
        for author in paper['author']:
            author_combined = u"{}{}".format(author.get("given", ""), author.get("family", ""))
            author_combined = author_combined.replace(" ", "")
            if is_book_review:
                # call these out separately so we know who is publishing and who is reviewing
                author_combined += "_book_review"
            author_counts[author_combined] += 1
    # list of lists so the result can be written out with csv writerows()
    return [[author, count] for author, count in author_counts.items()]
def write_frequencies(items):
    """
    Writes one frequency CSV per entry of `items`.

    Each file gets a "Word", "Frequency" header row followed by the rows in the entry's data.
    The files are opened in text mode with newline='' (what the csv module requires on Python 3;
    a file opened with 'wb' makes the writer fail with TypeError) and encoded as UTF-8 so
    non-ASCII words and names can be written.

    :param items: iterable of dicts with the keys "name" (label used in the progress message),
        "path" (output file path) and "data" (iterable of [item, count] rows)
    :return: None
    """
    for item in items:
        print("Writing Frequency Info for {}".format(item["name"]))
        with open(item["path"], 'w', newline='', encoding='utf-8') as output_file_handle:
            csv_writer = writer(output_file_handle)
            csv_writer.writerow(["Word", "Frequency"])
            csv_writer.writerows(item["data"])
def get_papers(issn=ISSN, offset=0, per_page=PER_PAGE):
    """
    Requests a single page of works for a journal from the CrossRef API.

    A Crossref client is created with HABANERO_USERNAME as the mailto address and its works()
    method is called with an ISSN filter.

    :param issn: ISSN to filter the works by
    :param offset: passed to works() as `offset`
    :param per_page: passed to works() as `limit`
    :return: whatever works() returns; get_paper_info() reads ['message']['total-results'] and
        ['message']['items'] from it
    """
    client = Crossref(mailto=HABANERO_USERNAME)
    issn_filter = {"issn": issn}
    response = client.works(filter=issn_filter, offset=offset, limit=per_page)
    return response
def get_paper_info():
    """
    Downloads every paper for ISSN, one page of PER_PAGE records at a time.

    The first response supplies the total number of results; pages are then requested at
    increasing offsets until that total is covered. Only records that have a 'title' key are
    kept, each reduced to the KEYS_TO_KEEP fields by make_data_safe(). No request (and no sleep)
    is issued once the last page has been processed.

    :return: list of paper dicts
    """
    collected_info = 0
    paper_info = get_papers(ISSN, collected_info, PER_PAGE)
    num_papers = paper_info['message'][u'total-results']
    print("Found {} papers".format(num_papers))

    papers = []
    # NOTE(review): the CrossRef REST API documentation describes a cap on `offset` (deep paging
    # is meant to use cursors) - confirm this loop is sufficient for very large journals.
    while collected_info < num_papers:
        collected_info += PER_PAGE
        print("Collecting {} papers".format(collected_info))
        # if it has a title in the data, we'll keep it
        papers.extend(make_data_safe(paper) for paper in paper_info['message']['items'] if 'title' in paper)

        if collected_info < num_papers:  # only fetch when there is another page to read
            # NOTE(review): SLEEP_TIME is defined at module level but this pause has always been a
            # hard-coded second; kept unchanged - confirm which delay is intended.
            sleep(1)
            paper_info = get_papers(ISSN, collected_info, PER_PAGE)  # get the next page
    return papers
def write_derived_products(papers):
    """
    Writes the paper records and the three frequency tables to their CSV files.

    The title, author and institution frequencies are computed first; the papers are then written
    to OUTPUT_FILE with KEYS_TO_KEEP as the columns, and the frequency tables are written through
    write_frequencies(). OUTPUT_FILE is opened in text mode with newline='' (what the csv module
    requires on Python 3; a file opened with 'wb' makes the writer fail with TypeError) and
    encoded as UTF-8.

    :param papers: list of paper dicts; DictWriter is given KEYS_TO_KEEP as fieldnames, so the
        dicts are expected to hold only those keys (as produced by make_data_safe())
    :return: None
    """
    title_frequency_info = frequency_titles(papers)
    author_frequency_info = frequency_authors(papers)
    institution_frequency_info = frequency_institutions(papers)

    print("Writing Paper Info")
    with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as output_file_handle:
        csv_writer = DictWriter(output_file_handle, fieldnames=KEYS_TO_KEEP)
        csv_writer.writeheader()
        csv_writer.writerows(papers)

    write_frequencies([
        {"name": "Title", "path": TITLE_FREQUENCY_FILE, "data": title_frequency_info},
        {"name": "Author", "path": AUTHOR_FREQUENCY_FILE, "data": author_frequency_info},
        {"name": "Institution", "path": INSTITUTION_FREQUENCY_FILE, "data": institution_frequency_info},
    ])
if __name__ == "__main__":
    # Script entry point: download the journal's papers, then write the paper CSV and the
    # frequency CSVs derived from them.
    write_derived_products(get_paper_info())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment