scrape the Complex Systems archives (http://www.complex-systems.com/archives.html)
#!/usr/bin/env python3
# Scrape Complex Systems article PDFs.
# Requires bs4: `pip install beautifulsoup4`
import os
import re
import json
from urllib.request import urlopen as get, urlretrieve as save
from bs4 import BeautifulSoup as bs
SKIP_EXISTING = True
ARCHIVE_DIR = 'archives'

if not os.path.exists(ARCHIVE_DIR):
    os.makedirs(ARCHIVE_DIR)
# the archive index links to one details page per issue
archive_url = 'http://www.complex-systems.com/archives.html'
# the per-issue pages (e.g. http://www.complex-systems.com/issues/06-1.html) hold the pdf links
base_url = 'http://www.complex-systems.com'

index_html = get(archive_url).read()
index = bs(index_html, 'html.parser')

# issue link titles look like "Vol. 6, No. 1"; capture volume and issue number
issue_data_finder = re.compile(r'Vol\.\s(\d+),\sNo\.\s(\d+)', re.I | re.DOTALL)
issues = []
for issue in index.find_all('a', attrs={'href': re.compile(r'/issues/[\d-]+\.html')}):
    vol, num = re.match(issue_data_finder, issue['title']).groups()
    issues.append({
        'vol': vol,
        'no': num,
        'href': base_url + issue['href'],
        'title': issue['title'],
        'img': base_url + issue.find('img')['src'],
        'articles': []
    })
for iss in issues:
    # FIXME: fetches the issue page even when we already have all the articles for that issue
    issue = bs(get(iss['href']).read(), 'html.parser')
    for article in issue.find_all('h3', attrs={'class': 'absTitle'}):
        pdf = article.find('a', attrs={'class': 'pdfLink'})
        if not pdf:
            # the most recent issue has its links commented out (which breaks bs4),
            # and those links 404 anyway
            continue
        pdf_link_stub = pdf['href']
        pdf_link = base_url + pdf_link_stub
        local_filename = ARCHIVE_DIR + pdf_link_stub.replace('/pdf/', '/')
        if SKIP_EXISTING and os.path.exists(local_filename):
            print(' 💯 file already exists! skipping %s -> %s' % (pdf_link, local_filename))
        else:
            # FIXME: should reuse the metadata from a previous run's metadata.json
            save(pdf_link, filename=local_filename)
            local_size_kb = os.path.getsize(local_filename) / 1024.0
            print(' ✨ saved pdf (%s -> %s) %0.1fkb' % (pdf_link, local_filename, local_size_kb))
        # note: the abstract link comes BEFORE the pdf link above (first child of `article`)
        details = article.next
        abstract_link = base_url + details['href']
        title = details.text
        author, authors = article.find_next_sibling('p'), []
        if author:
            authors = [t.strip() for t in author.find_all(string=True)]
        iss['articles'].append({
            'title': title,
            'abstract_link': abstract_link,
            'authors': authors,
            'pdf_link': pdf_link,
            'local_filename': local_filename
        })
with open('metadata.json', 'w') as metadata:
    metadata.write(json.dumps(issues, indent=2))
    print(' 🎀 wrote metadata file')

print(' 👍 done! have a nice day!')
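
Both FIXMEs above point at the same improvement: a later run could read the metadata.json written by a previous run and skip issues whose articles are already on disk. A minimal sketch of that idea, assuming a metadata.json from a completed earlier run sits next to the script (the load_previous helper and the downloaded set are illustrative names, not part of the original gist):

import json
import os

def load_previous(path='metadata.json'):
    # return (issues, downloaded-filenames) from a previous run,
    # or ([], set()) on a first run
    if not os.path.exists(path):
        return [], set()
    with open(path) as f:
        issues = json.load(f)
    # keep only local filenames that still exist on disk
    downloaded = {
        article['local_filename']
        for iss in issues
        for article in iss['articles']
        if os.path.exists(article['local_filename'])
    }
    return issues, downloaded

A caller could then skip fetching iss['href'] entirely whenever every article recorded for that issue is already in the downloaded set, which would address the first FIXME without touching the scraping logic itself.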