Last active
October 31, 2023 04:04
-
-
Save miketahani/473af74d61ebf40f24ddd786a027b112 to your computer and use it in GitHub Desktop.
scrape the Complex Systems archives (http://www.complex-systems.com/archives.html)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# scrape complex systems article pdfs | |
# requires bs4: `pip install beautifulsoup4` | |
import os | |
import re | |
import json | |
from urllib import urlopen as get, urlretrieve as save | |
from bs4 import BeautifulSoup as bs | |
# --- configuration ---------------------------------------------------------
SKIP_EXISTING = True      # skip pdfs that already exist on disk
ARCHIVE_DIR = 'archives'  # local directory the pdfs are saved into

if not os.path.exists(ARCHIVE_DIR):
    os.makedirs(ARCHIVE_DIR)

# get links to issue details pages from archive
archive_url = 'http://www.complex-systems.com/archives.html'
# get pdf links from issues page
# http://www.complex-systems.com/issues/06-1.html
base_url = 'http://www.complex-systems.com'

# fetch and parse the archive index page
index_html = get(archive_url).read()
# pass an explicit parser: without one, bs4 guesses (emits a warning and can
# pick different parsers on different machines, changing the parse tree)
index = bs(index_html, 'html.parser')
# pulls volume/number out of titles like "Vol. 12, No. 3"; raw strings avoid
# the invalid-escape warnings the old non-raw patterns produce
issue_data_finder = re.compile(r'Vol\.\s(\d+),\sNo\.\s(\d+)', re.I)
issue_href_finder = re.compile(r'/issues/[\d-]+\.html')

# collect one record per issue linked from the archive index
issues = []
for issue in index.find_all('a', attrs={'href': issue_href_finder}):
    match = issue_data_finder.match(issue['title'])
    if not match:
        # title doesn't look like "Vol. X, No. Y" -- skip it instead of
        # crashing on .groups() of None
        continue
    vol, num = match.groups()
    issues.append({
        'vol': vol,
        'no': num,
        'href': base_url + issue['href'],
        'title': issue['title'],
        'img': base_url + issue.find('img')['src'],
        'articles': []  # filled in by the download loop below
    })
# walk every issue detail page, download each article pdf, and accumulate
# per-article metadata into iss['articles']
for iss in issues:
    # FIXME gets issue page even when we already have all the articles for that issue
    issue = bs(get(iss['href']).read())
    # each article title on an issue page is an <h3 class="absTitle">
    for article in issue.find_all('h3', attrs={'class': 'absTitle'}):
        pdf = article.find('a', attrs={'class': 'pdfLink'})
        if not pdf:
            # most recent issue has links commented out (which breaks bs4), and the links 404
            continue
        pdf_link_stub = pdf['href']
        pdf_link = base_url + pdf_link_stub
        # map the server path onto the local archive dir,
        # e.g. '/pdf/06-1-1.pdf' -> 'archives/06-1-1.pdf'
        local_filename = ARCHIVE_DIR + pdf_link_stub.replace('/pdf/', '/')
        if SKIP_EXISTING and os.path.exists(local_filename):
            print u' 💯 file already exists! skipping %s -> %s' % (pdf_link, local_filename)
        else:
            # FIXME should just get the metadata from the metadata.json file
            save(pdf_link, filename=local_filename)
            local_size_kb = os.path.getsize(local_filename)/1024.0
            print u' ✨ saved pdf (%s -> %s) %0.1fkb' % (pdf_link, local_filename, local_size_kb)
        # note: this comes BEFORE the pdf link above (first child of `article`)
        # i.e. the abstract-page anchor is the heading's first child node
        details = article.next
        abstract_link = base_url + details['href']
        title = details.text
        # authors appear to live in the <p> sibling after the title heading;
        # gather all of its text nodes, stripped (empty list if no <p> found)
        author, authors = article.find_next_sibling('p'), []
        if author:
            authors = [text.strip() for text in author.find_all(text=True)]
        iss['articles'].append({
            'title': title,
            'abstract_link': abstract_link,
            'authors': authors,
            'pdf_link': pdf_link,
            'local_filename': local_filename
        })
with open('metadata.json', 'w') as metadata: | |
metadata.write(json.dumps(issues, indent=2)) | |
print u' 🎀 wrote metadata file' | |
print u' 👍 done! have a nice day!' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment