@coderjo
Created November 6, 2011 09:58
Simple Scientific American PDF URL builder
#!/usr/bin/env python
# Python 2 script: scrapes nature.com's Scientific American archive index and
# prints direct URLs to the per-issue article PDFs.
from HTMLParser import HTMLParser
import urllib2, re

# Article PDF links look like "pdf/1909-08-14.pdf" (issue date, 1800s-1900s).
pdfre = re.compile(r"^pdf/1[89][0-9][0-9]-[0-9]{2}-[0-9]{2}\.pdf$")
# Issue pages live under /scientificamerican/journal/v<volume>/ for volumes 1-101.
journalpage = re.compile(r"^/scientificamerican/journal/v([1-9]|[1-9][0-9]|10[01])/")

class SAParser(HTMLParser):
    """Minimal parser that hands every <a href> value to a callback."""
    def __init__(self, linkcallback):
        HTMLParser.__init__(self)
        self.newlink = linkcallback

    def handle_starttag(self, tag, attrs):
        if tag.lower() == "a":
            for attr in attrs:
                if attr[0].lower() == "href":
                    self.newlink(attr[1])

# Collect links to the individual issue pages from the archive index.
issues = list()
def issuelink(href):
    if journalpage.match(href):
        issues.append(href)

archivepage = urllib2.urlopen("http://www.nature.com/scientificamerican/archive/index_1909.html")
parser = SAParser(issuelink)
parser.feed(archivepage.read())
parser.close()
archivepage.close()

# Walk each issue page and print the absolute URL of every PDF linked on it.
baseurl = ''
def pdflink(href):
    if pdfre.match(href):
        print "http://www.nature.com%s/%s" % (baseurl, href)

for issue in issues:
    baseurl = issue.rpartition('/')[0]
    page = urllib2.urlopen("http://www.nature.com%s" % issue)
    parser = SAParser(pdflink)
    parser.feed(page.read())
    parser.close()
    page.close()
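
The script above targets Python 2 (HTMLParser, urllib2, print statement). Below is a minimal, untested sketch of the same approach on Python 3, assuming the 2011 nature.com archive layout still matches the regexes (it has likely changed since then):

#!/usr/bin/env python3
# Python 3 sketch of the same scraper; site layout is assumed unchanged since 2011.
import re
from html.parser import HTMLParser
from urllib.request import urlopen

pdfre = re.compile(r"^pdf/1[89][0-9][0-9]-[0-9]{2}-[0-9]{2}\.pdf$")
journalpage = re.compile(r"^/scientificamerican/journal/v([1-9]|[1-9][0-9]|10[01])/")

class SAParser(HTMLParser):
    """Pass every <a href> value to a callback."""
    def __init__(self, linkcallback):
        super().__init__()
        self.newlink = linkcallback

    def handle_starttag(self, tag, attrs):
        if tag.lower() == "a":
            for name, value in attrs:
                if name.lower() == "href":
                    self.newlink(value)

# Collect issue-page links from the archive index.
issues = []
def issuelink(href):
    if journalpage.match(href):
        issues.append(href)

with urlopen("http://www.nature.com/scientificamerican/archive/index_1909.html") as archivepage:
    parser = SAParser(issuelink)
    parser.feed(archivepage.read().decode("utf-8", "replace"))
    parser.close()

# Print the absolute URL of every PDF found on each issue page.
for issue in issues:
    baseurl = issue.rpartition('/')[0]
    def pdflink(href, baseurl=baseurl):
        if pdfre.match(href):
            print("http://www.nature.com%s/%s" % (baseurl, href))
    with urlopen("http://www.nature.com%s" % issue) as page:
        parser = SAParser(pdflink)
        parser.feed(page.read().decode("utf-8", "replace"))
        parser.close()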