Scrape abstracts from PNAS.
"""
Copyright (C) 2015 Baxter Eaves
License: Do what the fuck you want to public license (WTFPL) V2
Scrape abstracts from the Proceedings of the National Academy of Sciences.
Requires: beutifulsoup4
"""
import re
import sys
import time
import pickle

import requests
from bs4 import BeautifulSoup

HEADERS = {'Accept-Language': 'en-US,en;q=0.8'}
PNAS_URL = "http://www.pnas.org"
PNAS_CONTENT_URL = PNAS_URL + "/content/by/year"

MAX_REQUEST_RATE = 3.1  # PNAS gets upset if you go any faster
TIMEOUT = 60  # PNAS' website is damned slow


def get_abstract(absurl):
    url = PNAS_URL + absurl

    sys.stdout.write('_%s...' % (url,))
    sys.stdout.flush()

    t_start = time.time()

    page = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    soup = BeautifulSoup(page.text, 'html.parser')

    authors = soup.find_all("meta", {"name": "DC.Contributor"})
    issn = soup.find_all("meta", {"name": "citation_issn"})
    title = soup.find("meta", {"name": "citation_title"})['content']

    abspars = soup.findAll('p', id=re.compile(r'^p-\d+'))
    if len(abspars) == 0:
        raise RuntimeError("Failed to find abstract.")
    elif len(abspars) == 1:
        abspar = abspars[0]
    else:
        # Assume that the abstract is the paragraph with the most text.
        parlens = [len(par.text) for par in abspars]
        absidx = max(enumerate(parlens), key=lambda x: x[1])[0]
        abspar = abspars[absidx]

    if len(abspar.text) < len(title):
        raise RuntimeError("Abstract short. Maybe found the wrong thing?")

    # `abspar` is a single tag at this point; collapse whitespace in its text.
    abstract = re.sub('[ \t\n]+', ' ', abspar.text)

    data = {
        'Title': title,
        'Abstract': abstract,
        'Authors': "; ".join(a['content'] for a in authors),
        'Date': soup.find("meta", {"name": "DC.Date"})['content'],
        'Volume': soup.find("meta", {"name": "citation_volume"})['content'],
        'Issue': soup.find("meta", {"name": "citation_issue"})['content'],
        'ISSN': "; ".join(n['content'] for n in issn)}

    t_total = time.time() - t_start
    t_diff = MAX_REQUEST_RATE - t_total

    sys.stdout.write('(%1.2f sec)%s\n' % (t_total, title,))
    sys.stdout.flush()

    if t_diff > 0:
        time.sleep(t_diff)

    return data
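
# Illustrative call (a sketch, not part of the original gist): the abstract
# path below is a hypothetical example of the '/content/...' hrefs collected
# by get_issue_abstract_urls(); the returned dict always has these seven keys.
#
# >>> rec = get_abstract('/content/98/1/31.abstract')
# >>> sorted(rec.keys())
# ['Abstract', 'Authors', 'Date', 'ISSN', 'Issue', 'Title', 'Volume']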


def get_issue_abstract_urls(vol, issue):
    sys.stdout.write(' Issue {}...'.format(issue+1))
    sys.stdout.flush()

    url = PNAS_URL + "/content/{}/{}.toc".format(vol, issue+1)
    page = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    soup = BeautifulSoup(page.text, 'html.parser')

    articles = soup.find_all('a', {'rel': 'abstract'})

    retval = []
    for absurl in articles:
        retval.append([vol, issue, absurl['href']])

    sys.stdout.write('done (%d abstracts).\n' % (len(retval),))

    return retval
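
# Illustrative return value (a sketch; the href is hypothetical): each entry
# pairs the volume and zero-based issue index with an abstract URL path.
#
# >>> get_issue_abstract_urls(98, 0)  # fetches /content/98/1.toc
# [[98, 0, '/content/98/1/31.abstract'], ...]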


def get_abstract_urls_for_year(year):
    sys.stdout.write('Year: {}\n'.format(year))
    sys.stdout.flush()

    url = "{}/{}".format(PNAS_CONTENT_URL, year)
    page = requests.get(url, headers=HEADERS, timeout=TIMEOUT)

    # Every issue link on the year page repeats the volume number, so the
    # number of matches is taken as the number of issues in that volume.
    vols = re.findall(r'"/content/vol(\d+)', page.text)
    vol = vols[0]
    n_issues = len(vols)

    retval = []
    for issue in range(n_issues):
        retval.extend(get_issue_abstract_urls(vol, issue))

    return retval


def genargs(first_year=1991, last_year=2001):
    """ Generate args to scrape all PNAS abstracts from first_year to
    last_year.
    """
    print("Scraping abstract URLs {}-{}".format(first_year, last_year))
    args = []
    for year in range(first_year, last_year+1):
        args.extend(get_abstract_urls_for_year(year))

    # Append a running index to each (vol, issue, url) entry so that failed
    # scrapes can be identified and resumed.
    argsout = []
    for i, arg in enumerate(args):
        argsout.append(tuple(arg + [i]))

    return argsout
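
# Illustrative output (a sketch with hypothetical values): genargs() returns
# (volume, issue, url, index) tuples; the trailing index is what scrape()
# reports on failure and what you pass back in as `start` to resume.
#
# >>> genargs(first_year=1999, last_year=1999)[:1]
# [('96', 0, '/content/96/1/23.abstract', 0)]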


def scrape(args, filename='data/pnas.pkl', start=None):
    """ Scrape the PNAS database for abstracts.

    Implementation notes
    --------------------
    - Won't stop until it's done or until you ctrl+C.
    - Saves the data to a list of dicts that can be easily converted into a
      pandas DataFrame.

    Parameters
    ----------
    args : list(tuple)
        Generated by genargs()
    filename : str
        Filename for the output
    start : int >= 0 or None
        If an int, n, the scrape starts at the nth arg and appends new
        abstracts to filename. Use this if some unhandled error occurs,
        e.g. PNAS kicks you out.

    Example
    -------
    Convert to a pandas DataFrame

    >>> import pickle
    >>> import pandas
    >>> args = genargs(first_year=1999, last_year=2001)
    >>> scrape(args, 'pnas.pkl')
    >>> data = pickle.load(open('pnas.pkl', 'rb'))
    >>> df = pandas.DataFrame(data)

    Resume a scrape that failed on the 768th abstract

    >>> '+ Exception: 768 failed'
    >>> scrape(args, 'pnas.pkl', start=768)
    """
    if start is None:
        start = 0
        data = []
    else:
        data = pickle.load(open(filename, 'rb'))

    for _, _, url, idx in args[start:]:
        success = False
        while not success:
            try:
                # Save each time we scrape so that we can resume if PNAS kicks
                # us out.
                data.append(get_abstract(url))
                pickle.dump(data, open(filename, 'wb'))
                success = True
            except TypeError:
                print("\n+ TypeError (request failure): %d failed" % (idx,))
                print("+ Waiting for %d seconds before retry..." % (TIMEOUT,))
                time.sleep(TIMEOUT)
            except Exception as err:
                print("\n+ Exception: %d failed" % (idx,))
                raise err

    return data
if __name__ == "__main__":
args_filename = 'data/pnas91-01_args.pkl'
data_filename = 'data/pnas91-01_data.pkl'
# NOTE: I've found that in general the issue splash pages load
# significantly slower than the abstract pages. We first scrape the splash
# pages for # abstract URLs and then scrap those URLs for abstracts and
# other metadata.
# NOTE: PNAS is pretty finicky about scraping. They only let you make one
# request every 3 seconds. One year has about 2400 abstracts, so each year
# is going to take you two hours, at minimum.
args = genargs(first_year=1991, last_year=2001)
pickle.dump(args, open(args_filename, 'wb'))
args = pickle.load(open(args_filename, 'rb'))
scrape(args, data_filename)
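
# Downstream use (a sketch, assuming pandas is installed; it is not required
# by the scraper itself): the pickled list of dicts converts directly to a
# DataFrame, e.g. for export to a CSV whose name here is arbitrary.
#
# >>> import pickle
# >>> import pandas as pd
# >>> with open(data_filename, 'rb') as f:
# ...     records = pickle.load(f)
# >>> pd.DataFrame(records).to_csv('pnas91-01_abstracts.csv', index=False)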