@dwf
Created February 1, 2010 21:01
An old screen-scraping RSS feed generation script for a certain webcomic.
"""
tfd.py -- Scrapes a certain set of comics' websites, and spits
out home-grown RSS feeds for them.
This is pretty useless now that all of the Sharing Machine comics
provide RSS feeds, but might serve as an okay example of old-school
screen scraping and XML generation in Python. I hear the cool kids
use BeautifulSoup (for scraping) and lxml (for XML generation)
nowadays.
By David Warde-Farley -- user AT cs dot toronto dot edu (user = dwf)
Redistributable under the terms of the 3-clause BSD license
(see http://www.opensource.org/licenses/bsd-license.php for details)
"""
import sgmllib
import re
import urllib
import sys
import xml.dom.minidom as minidom
import time
import datetime
DEBUG = False
"""Mappings from the short acronyms to the main part of the domains."""
longnames = {
'mtts' : 'marriedtothesea',
'tfd' : 'toothpastefordinner',
'nd' : 'nataliedee'
}
"""Months of the year. You'd think there's be a simple built-in way?"""
months = ['jan','feb','mar','apr','may','jun',
'jul','aug','sep','oct','nov','dec']
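
# For the curious: a minimal sketch of the same link scrape done with
# BeautifulSoup, as name-dropped in the docstring above. Assumes the bs4
# package is installed; purely illustrative and never called by this script.
def _scrape_with_beautifulsoup(html, url_pattern):
    """Return (href, link text) pairs for anchors whose href matches."""
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html)
    return [(a['href'], a.get_text().strip())
            for a in soup.find_all('a', href=re.compile(url_pattern))]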
class ToothpasteScraper(sgmllib.SGMLParser):
"""
Scrapes a certain comic's main website (as of 03/13/08) to grab a list
of recent comics for further processing. A list of tuples is available
in the instance variable 'comic_list' after processing.
Essentially works from a 'Visitor' pattern with hooks provided by
SGMLParser, we just look for the links that are actual links to comics
(or the title) and maintain some state to process stuff between those
tags only when relevant (start_a and start_title set flags that
let the next handle_data call know that he's up at bat for something.)
"""
    def __init__(self, comic=None):
        """
        Instantiate a ToothpasteScraper for the given comic acronym
        (a key of 'longnames'). The current month's archive page is
        fetched with urllib and feed() is called immediately with the
        resulting string.
        """
# Empty comic list.
self.comic_list = []
        # Fallback title in case we never see a TITLE tag.
self.comic_title = "(No comic title)"
# Boolean state flags so that we know when to care about textual data
self.parsing_comic_link = False
self.parsing_title = False
        # Regular expression we'd like to match against (dots escaped so
        # they can't match arbitrary characters).
        self.URL_PATTERN = \
            r"http://www\.%s\.com/[01]\d{5}/([a-zA-Z_]|-)+\.(gif|jpg)" % \
            (longnames[comic],)
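        # e.g. this should match a URL of the form (hypothetical filename)
        # http://www.toothpastefordinner.com/013108/some-comic.gif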
# Instantiate superclass.
sgmllib.SGMLParser.__init__(self)
self._go(comic)
def _go(self,comic):
"""Starts the parser to work with data fed from an URL connection."""
today = datetime.datetime.today()
month = months[today.month - 1]
year = str(today.year)[-2:]
url = 'http://www.%s.com/%s-archives/%sarchive-%s%s.php' % \
(longnames[comic], comic, comic, month, year)
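        # e.g. for comic='tfd' in March 2008 this is
        # http://www.toothpastefordinner.com/tfd-archives/tfdarchive-mar08.php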
if DEBUG:
print >> sys.stderr, "Fetching " + url
handle = urllib.urlopen(url)
data = handle.read()
handle.close()
self.feed(data)
self.close()
def start_a(self,attributes):
"""
        Called when an opening anchor tag is encountered. Matches the HREF
        against the comic-link pattern and records the link on a match.
"""
atts = dict(attributes)
if 'href' in atts and re.match(self.URL_PATTERN, atts['href']):
self.parsing_comic_link = True
if DEBUG:
print >>sys.stderr, atts['href'].split("/")
raw_d = atts['href'].split("/")[3]
self.cur_date = "%s/%s/%s" % (raw_d[0:2],raw_d[2:4],raw_d[4:6])
self.cur_link = atts['href']
def handle_data(self, data):
"""
Handles arbitrary data, but only actually does anything if we're
inside a comic link or a title tag.
"""
if self.parsing_comic_link:
self.cur_text = data.strip() # Not sure if the strip is necessary.
elif self.parsing_title:
self.comic_title = "".join(data.strip().split("\n"))
def end_a(self):
"""
Called when an ending anchor tag is encountered. If a link is being
read, it's finalized and added to the list. Otherwise nothing
happens.
"""
if self.parsing_comic_link:
self.comic_list.append((self.cur_link,self.cur_date,self.cur_text))
del self.cur_link, self.cur_date, self.cur_text
        # Reset unconditionally so a stray </a> can't leave us stuck in
        # link-parsing mode.
self.parsing_comic_link = False
def start_title(self, attributes):
"""Used to scrape the title of the comic from the TITLE tag."""
self.parsing_title = True
def end_title(self):
"""Used to scrape the title of the comic from the TITLE tag."""
self.parsing_title = False
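
# Example of what a scraper ends up holding (hypothetical values, assuming
# the site is reachable and its markup unchanged):
#
#   scraper = ToothpasteScraper(comic='tfd')
#   scraper.comic_title  # -> 'Toothpaste For Dinner'
#   scraper.comic_list   # -> [('http://www.toothpastefordinner.com/020110/brushing.gif',
#                        #      '02/01/10', 'brushing'), ...]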
class RSSMaker:
    def __init__(self, url, title, comics, desc=None):
        """
        Instantiates an RSSMaker, which needs a channel URL and a list
        of comic tuples to create itself a DOM tree. A title and a
        description are also kinda nice; defaults are substituted if
        they're missing.
        """
self.date_format = "%a, %d %b %Y %H:%M:%S %Z"
if url[-1] != "/":
url += "/"
self.url = url
impl = minidom.getDOMImplementation()
self.xmldocument = impl.createDocument(None, "rss", None)
self.xmldocument.documentElement.setAttribute("version","2.0")
self.chan_tag = self.xmldocument.createElement("channel")
self.xmldocument.documentElement.appendChild(self.chan_tag)
titletag = self.xmldocument.createElement("title")
if not title:
title = "Scraped RSS Feed of " + url
titletag.appendChild(self._mktxt(title))
self.chan_tag.appendChild(titletag)
linktag = self.xmldocument.createElement("link")
linktag.appendChild(self._mktxt(url))
self.chan_tag.appendChild(linktag)
desctag = self.xmldocument.createElement("description")
if not desc:
desc = "No description provided."
desctag.appendChild(self._mktxt(desc))
self.chan_tag.appendChild(desctag)
        pubdatetag = self.xmldocument.createElement("pubDate")
        date = time.strftime(self.date_format)
        pubdatetag.appendChild(self._mktxt(date))
        self.chan_tag.appendChild(pubdatetag)
if comics:
self._load_comics(comics)
def _mktxt(self, text):
"""Convenience method for creating text nodes."""
return self.xmldocument.createTextNode(text)
def _load_comics(self, comics):
"""
Private method called by the constructor that
actually loads in the comics and creates the relevant
DOM elements.
"""
        for comic in comics:
            comic_url = comic[0]
            comic_item = self.xmldocument.createElement("item")
            # Dates arrive as MM/DD/YY strings; promote to four-digit years.
            date_nums = [int(x) for x in comic[1].split("/")]
            date_nums[-1] += 2000
            dt = datetime.datetime(date_nums[2], date_nums[0], date_nums[1])
            # timetuple() leaves tm_isdst at -1; swap in time.daylight so
            # strftime's %Z can resolve a zone name.
            tup = dt.timetuple()[:-1] + (time.daylight,)
            datestr = time.strftime(self.date_format, tup)
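            # e.g. '02/01/10' -> datetime(2010, 2, 1) -> "Mon, 01 Feb 2010
            # 00:00:00 EST" (the zone name depends on the local timezone).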
comic_date = self.xmldocument.createElement("pubDate")
comic_date.appendChild(self._mktxt(datestr))
comic_title = self.xmldocument.createElement("title")
comic_title.appendChild(self._mktxt(comic[2]))
comic_link = self.xmldocument.createElement("link")
comic_link.appendChild(self._mktxt(comic[0]))
comic_desc = self.xmldocument.createElement("description")
desc = "<img src=\"%s\" alt=\"%s\" />"% (comic[0],comic[2])
comic_desc.appendChild(self._mktxt(desc))
comic_item.appendChild(comic_title)
comic_item.appendChild(comic_link)
comic_item.appendChild(comic_desc)
comic_item.appendChild(comic_date)
comic_guid = self.xmldocument.createElement("guid")
comic_guid.appendChild(self._mktxt(comic_url))
comic_item.appendChild(comic_guid)
self.chan_tag.appendChild(comic_item)
def getfeed(self):
"""
        Retrieve the XML document in all of its glory. Presumably, if the
        constructor didn't bubble up an exception, this method should give
        you a valid RSS feed.
"""
return self.xmldocument.toprettyxml()
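
# For comparison, a minimal sketch of the same channel skeleton built with
# lxml.etree, the library mentioned in the module docstring. Assumes lxml
# is installed; purely illustrative and never called by this script.
def _lxml_feed_skeleton(url, title):
    """Build the <rss><channel> skeleton with lxml; return it as a string."""
    from lxml import etree
    rss = etree.Element("rss", version="2.0")
    chan = etree.SubElement(rss, "channel")
    etree.SubElement(chan, "title").text = title
    etree.SubElement(chan, "link").text = url
    etree.SubElement(chan, "description").text = "No description provided."
    return etree.tostring(rss, pretty_print=True)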
def usage(argv):
print >> sys.stderr, "usage: %s -c {mtts,tfd,nd}" % (argv[0])
if __name__ == "__main__":
    import getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'c:')
        comic = dict(opts)['-c']
    except (getopt.GetoptError, KeyError):
        usage(sys.argv)
        sys.exit(1)
    if comic not in longnames:
        usage(sys.argv)
        sys.exit(1)
scraper = ToothpasteScraper(comic=comic)
    f = RSSMaker("http://www.%s.com/" % (longnames[comic],),
                 title=scraper.comic_title, comics=scraper.comic_list)
print f.getfeed()
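
# Typical invocation (the feed XML is printed to stdout):
#
#   python tfd.py -c tfd > tfd.rss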