An old screen scraping RSS feed generation script for a certain webcomic.
""" | |
tfd.py -- Scrapes a certain set of comics' websites, and spits | |
out home-grown RSS feeds for them. | |
This is pretty useless now that all of the Sharing Machine comics | |
provide RSS feeds, but might serve as an okay example of old-school | |
screen scraping and XML generation in Python. I hear the cool kids | |
use BeautifulSoup (for scraping) and lxml (for XML generation) | |
nowadays. | |
By David Warde-Farley -- user AT cs dot toronto dot edu (user = dwf) | |
Redistributable under the terms of the 3-clause BSD license | |
(see http://www.opensource.org/licenses/bsd-license.php for details) | |
""" | |
import sgmllib
import re
import urllib
import sys
import xml.dom.minidom as minidom
import time, datetime

DEBUG = False

"""Mappings from the short acronyms to the main part of the domains."""
longnames = {
    'mtts' : 'marriedtothesea',
    'tfd' : 'toothpastefordinner',
    'nd' : 'nataliedee'
}

"""Months of the year. You'd think there'd be a simple built-in way?"""
months = ['jan','feb','mar','apr','may','jun',
          'jul','aug','sep','oct','nov','dec']
class ToothpasteScraper(sgmllib.SGMLParser):
    """
    Scrapes a certain comic's main website (as of 03/13/08) to grab a list
    of recent comics for further processing. A list of tuples is available
    in the instance variable 'comic_list' after processing.

    Essentially works like a 'Visitor' pattern, with the hooks provided by
    SGMLParser: we look for the links that actually point to comics (or the
    title) and maintain some state so that text between those tags is only
    processed when relevant (start_a and start_title set flags that let the
    next handle_data call know that it's up at bat for something).
    """
    def __init__(self, comic=None):
        """
        Instantiate a ToothpasteScraper for the given comic acronym
        ('mtts', 'tfd' or 'nd'). The current archive page for that comic
        is opened with urllib and feed() is called immediately with the
        resulting string.
        """
        # Empty comic list.
        self.comic_list = []
        # Title - uhhhh.
        self.comic_title = "(No comic title)"
        # Boolean state flags so that we know when to care about textual data
        self.parsing_comic_link = False
        self.parsing_title = False
        # Regular expression we'd like to match comic links against.
        self.URL_PATTERN = \
            r"http://www\.%s\.com/[01]\d{5}/([a-zA-Z_]|\-)+\.(gif|jpg)" % \
            (longnames[comic],)
        # Instantiate superclass.
        sgmllib.SGMLParser.__init__(self)
        self._go(comic)
    def _go(self, comic):
        """Starts the parser on data fetched from the comic's archive URL."""
        today = datetime.datetime.today()
        month = months[today.month - 1]
        year = str(today.year)[-2:]
        url = 'http://www.%s.com/%s-archives/%sarchive-%s%s.php' % \
            (longnames[comic], comic, comic, month, year)
        if DEBUG:
            print >> sys.stderr, "Fetching " + url
        handle = urllib.urlopen(url)
        data = handle.read()
        handle.close()
        self.feed(data)
        self.close()
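    # For illustration: with comic='tfd' in February 2010, _go would fetch
    # an archive URL of the form
    #     http://www.toothpastefordinner.com/tfd-archives/tfdarchive-feb10.php
    # (constructed purely from the format string above; the exact archive
    # layout is whatever the site used at the time).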
    def start_a(self, attributes):
        """
        Called when an opening anchor tag is encountered. Matches HREF
        against the comic-link pattern and, on a match, records the link
        and the date encoded in its path.
        """
        atts = dict(attributes)
        if 'href' in atts and re.match(self.URL_PATTERN, atts['href']):
            self.parsing_comic_link = True
            if DEBUG:
                print >> sys.stderr, atts['href'].split("/")
            # The fourth path component is the six-digit MMDDYY date directory.
            raw_d = atts['href'].split("/")[3]
            self.cur_date = "%s/%s/%s" % (raw_d[0:2], raw_d[2:4], raw_d[4:6])
            self.cur_link = atts['href']
    def handle_data(self, data):
        """
        Handles arbitrary data, but only actually does anything if we're
        inside a comic link or a title tag.
        """
        if self.parsing_comic_link:
            self.cur_text = data.strip() # Not sure if the strip is necessary.
        elif self.parsing_title:
            self.comic_title = "".join(data.strip().split("\n"))
    def end_a(self):
        """
        Called when a closing anchor tag is encountered. If a comic link is
        being read, it's finalized and added to the list. Otherwise nothing
        happens.
        """
        if self.parsing_comic_link:
            self.comic_list.append((self.cur_link, self.cur_date, self.cur_text))
            del self.cur_link, self.cur_date, self.cur_text
        # Reset unconditionally, outside the if, for robustness.
        self.parsing_comic_link = False
    def start_title(self, attributes):
        """Used to scrape the title of the comic from the TITLE tag."""
        self.parsing_title = True

    def end_title(self):
        """Used to scrape the title of the comic from the TITLE tag."""
        self.parsing_title = False
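# A rough usage sketch (values are illustrative, not real output):
#
#     scraper = ToothpasteScraper(comic='tfd')
#     scraper.comic_title   # contents of the page's <title> tag
#     scraper.comic_list    # e.g. [('http://www.toothpastefordinner.com/020110/some-comic.gif',
#                           #        '02/01/10', 'some comic'), ...]
#
# i.e. each entry is a (link, MM/DD/YY date, link text) tuple, which is the
# shape RSSMaker._load_comics below expects.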
class RSSMaker:
    def __init__(self, url, title, comics, desc=None):
        """
        Instantiates an RSSMaker, which needs a base URL and a list of
        comics to build itself a DOM tree. A title and a description are
        also kinda nice.
        """
        # Date format along the lines of RFC 822, as RSS 2.0 expects.
        self.date_format = "%a, %d %b %Y %H:%M:%S %Z"
        if url[-1] != "/":
            url += "/"
        self.url = url
        impl = minidom.getDOMImplementation()
        self.xmldocument = impl.createDocument(None, "rss", None)
        self.xmldocument.documentElement.setAttribute("version", "2.0")
        self.chan_tag = self.xmldocument.createElement("channel")
        self.xmldocument.documentElement.appendChild(self.chan_tag)
        titletag = self.xmldocument.createElement("title")
        if not title:
            title = "Scraped RSS Feed of " + url
        titletag.appendChild(self._mktxt(title))
        self.chan_tag.appendChild(titletag)
        linktag = self.xmldocument.createElement("link")
        linktag.appendChild(self._mktxt(url))
        self.chan_tag.appendChild(linktag)
        desctag = self.xmldocument.createElement("description")
        if not desc:
            desc = "No description provided."
        desctag.appendChild(self._mktxt(desc))
        self.chan_tag.appendChild(desctag)
        # Channel-level pubDate: the time the feed was generated.
        pubdatetag = self.xmldocument.createElement("pubDate")
        date = time.strftime(self.date_format)
        pubdatetag.appendChild(self._mktxt(date))
        self.chan_tag.appendChild(pubdatetag)
        if comics:
            self._load_comics(comics)
    def _mktxt(self, text):
        """Convenience method for creating text nodes."""
        return self.xmldocument.createTextNode(text)
    def _load_comics(self, comics):
        """
        Private method called by the constructor that actually loads in
        the comics and creates the relevant DOM elements.
        """
        for comic in comics:
            comic_url = comic[0]
            comic_item = self.xmldocument.createElement("item")
            # comic[1] is an MM/DD/YY string; turn it into a full date.
            date_nums = [int(x) for x in comic[1].split("/")]
            date_nums[-1] += 2000
            dt = datetime.datetime(date_nums[2], date_nums[0], date_nums[1])
            # Swap in the local DST flag so that strftime's %Z is sensible.
            tup = dt.timetuple()[:-1] + (time.daylight,)
            datestr = time.strftime(self.date_format, tup)
            comic_date = self.xmldocument.createElement("pubDate")
            comic_date.appendChild(self._mktxt(datestr))
            comic_title = self.xmldocument.createElement("title")
            comic_title.appendChild(self._mktxt(comic[2]))
            comic_link = self.xmldocument.createElement("link")
            comic_link.appendChild(self._mktxt(comic[0]))
            comic_desc = self.xmldocument.createElement("description")
            desc = "<img src=\"%s\" alt=\"%s\" />" % (comic[0], comic[2])
            comic_desc.appendChild(self._mktxt(desc))
            comic_item.appendChild(comic_title)
            comic_item.appendChild(comic_link)
            comic_item.appendChild(comic_desc)
            comic_item.appendChild(comic_date)
            comic_guid = self.xmldocument.createElement("guid")
            comic_guid.appendChild(self._mktxt(comic_url))
            comic_item.appendChild(comic_guid)
            self.chan_tag.appendChild(comic_item)
    def getfeed(self):
        """
        Retrieve the XML document in all of its glory. Presumably, if the
        constructor didn't bubble up an exception, this method should give
        you a valid RSS feed.
        """
        return self.xmldocument.toprettyxml()
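# Roughly speaking, getfeed() returns a pretty-printed document shaped like
# this (values are illustrative; minidom escapes the <img> markup inside
# the description text node):
#
#     <rss version="2.0">
#       <channel>
#         <title>...</title>
#         <link>http://www.toothpastefordinner.com/</link>
#         <description>No description provided.</description>
#         <pubDate>Mon, 01 Feb 2010 21:01:00 EST</pubDate>
#         <item>
#           <title>some comic</title>
#           <link>http://www.toothpastefordinner.com/020110/some-comic.gif</link>
#           <description>&lt;img src="..." alt="..." /&gt;</description>
#           <pubDate>Mon, 01 Feb 2010 00:00:00 EST</pubDate>
#           <guid>http://www.toothpastefordinner.com/020110/some-comic.gif</guid>
#         </item>
#       </channel>
#     </rss>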
def usage(argv):
    print >> sys.stderr, "usage: %s -c {mtts,tfd,nd}" % (argv[0],)
if __name__ == "__main__": | |
import getopt | |
opts, args = getopt.getopt(sys.argv[1:],'c:') | |
try: | |
d = dict(opts) | |
comic = d['-c'] | |
except: | |
usage(sys.argv) | |
sys.exit(1) | |
scraper = ToothpasteScraper(comic=comic) | |
f = RSSMaker("http://www.%s.com/" % (longnames[comic]), \ | |
title=scraper.comic_title, comics=scraper.comic_list) | |
print f.getfeed() |
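# Typical invocation (the feed is written to stdout, so redirect it):
#
#     python tfd.py -c tfd > tfd.rss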