Last active
October 31, 2023 04:04
-
-
Save miketahani/473af74d61ebf40f24ddd786a027b112 to your computer and use it in GitHub Desktop.
scrape the Complex Systems archives (http://www.complex-systems.com/archives.html)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# scrape complex systems article pdfs | |
# requires bs4: `pip install beautifulsoup4` | |
import os | |
import re | |
import json | |
from urllib import urlopen as get, urlretrieve as save | |
from bs4 import BeautifulSoup as bs | |
# --- configuration ---------------------------------------------------------
SKIP_EXISTING = True      # skip pdfs that already exist on disk
ARCHIVE_DIR = 'archives'  # local directory the pdfs are saved into

if not os.path.exists(ARCHIVE_DIR):
    os.makedirs(ARCHIVE_DIR)

# get links to issue details pages from archive
archive_url = 'http://www.complex-systems.com/archives.html'
# get pdf links from issues page
# http://www.complex-systems.com/issues/06-1.html
base_url = 'http://www.complex-systems.com'

# fetch and parse the archive index page
index_html = get(archive_url).read()
# pass an explicit parser: without one, bs4 guesses (emits a warning and can
# pick different parsers on different machines, changing the parse tree)
index = bs(index_html, 'html.parser')
# pulls volume/number out of titles like "Vol. 12, No. 3"; raw strings avoid
# the invalid-escape warnings the old non-raw patterns produce
issue_data_finder = re.compile(r'Vol\.\s(\d+),\sNo\.\s(\d+)', re.I)
issue_href_finder = re.compile(r'/issues/[\d-]+\.html')

# collect one record per issue linked from the archive index
issues = []
for issue in index.find_all('a', attrs={'href': issue_href_finder}):
    match = issue_data_finder.match(issue['title'])
    if not match:
        # title doesn't look like "Vol. X, No. Y" -- skip it instead of
        # crashing on .groups() of None
        continue
    vol, num = match.groups()
    issues.append({
        'vol': vol,
        'no': num,
        'href': base_url + issue['href'],
        'title': issue['title'],
        'img': base_url + issue.find('img')['src'],
        'articles': []  # filled in by the download loop below
    })
# walk every issue detail page, download each article pdf, and accumulate
# per-article metadata into iss['articles']
for iss in issues:
    # FIXME gets issue page even when we already have all the articles for that issue
    issue = bs(get(iss['href']).read())
    # each article title on an issue page is an <h3 class="absTitle">
    for article in issue.find_all('h3', attrs={'class': 'absTitle'}):
        pdf = article.find('a', attrs={'class': 'pdfLink'})
        if not pdf:
            # most recent issue has links commented out (which breaks bs4), and the links 404
            continue
        pdf_link_stub = pdf['href']
        pdf_link = base_url + pdf_link_stub
        # map the server path onto the local archive dir,
        # e.g. '/pdf/06-1-1.pdf' -> 'archives/06-1-1.pdf'
        local_filename = ARCHIVE_DIR + pdf_link_stub.replace('/pdf/', '/')
        if SKIP_EXISTING and os.path.exists(local_filename):
            print u' 💯 file already exists! skipping %s -> %s' % (pdf_link, local_filename)
        else:
            # FIXME should just get the metadata from the metadata.json file
            save(pdf_link, filename=local_filename)
            local_size_kb = os.path.getsize(local_filename)/1024.0
            print u' ✨ saved pdf (%s -> %s) %0.1fkb' % (pdf_link, local_filename, local_size_kb)
        # note: this comes BEFORE the pdf link above (first child of `article`)
        # i.e. the abstract-page anchor is the heading's first child node
        details = article.next
        abstract_link = base_url + details['href']
        title = details.text
        # authors appear to live in the <p> sibling after the title heading;
        # gather all of its text nodes, stripped (empty list if no <p> found)
        author, authors = article.find_next_sibling('p'), []
        if author:
            authors = [text.strip() for text in author.find_all(text=True)]
        iss['articles'].append({
            'title': title,
            'abstract_link': abstract_link,
            'authors': authors,
            'pdf_link': pdf_link,
            'local_filename': local_filename
        })
with open('metadata.json', 'w') as metadata: | |
metadata.write(json.dumps(issues, indent=2)) | |
print u' 🎀 wrote metadata file' | |
print u' 👍 done! have a nice day!' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment