An old screen scraping RSS feed generation script for a certain webcomic.
""" | |
tfd.py -- Scrapes a certain set of comics' websites, and spits | |
out home-grown RSS feeds for them. | |
This is pretty useless now that all of the Sharing Machine comics | |
provide RSS feeds, but might serve as an okay example of old-school | |
screen scraping and XML generation in Python. I hear the cool kids | |
use BeautifulSoup (for scraping) and lxml (for XML generation) | |
nowadays. | |
By David Warde-Farley -- user AT cs dot toronto dot edu (user = dwf) | |
Redistributable under the terms of the 3-clause BSD license | |
(see http://www.opensource.org/licenses/bsd-license.php for details) | |
""" | |
import sgmllib
import re
import urllib
import sys
import xml.dom.minidom as minidom
import time, datetime

DEBUG = False

"""Mappings from the short acronyms to the main part of the domains."""
longnames = {
    'mtts' : 'marriedtothesea',
    'tfd' : 'toothpastefordinner',
    'nd' : 'nataliedee'
}

"""Months of the year. You'd think there'd be a simple built-in way?"""
months = ['jan','feb','mar','apr','may','jun',
          'jul','aug','sep','oct','nov','dec']
class ToothpasteScraper(sgmllib.SGMLParser):
    """
    Scrapes a certain comic's main website (as of 03/13/08) to grab a list
    of recent comics for further processing. A list of tuples is available
    in the instance variable 'comic_list' after processing.

    Essentially works like a 'Visitor' pattern, with the hooks provided by
    SGMLParser: we look for the links that actually point to comics (or the
    title) and maintain some state so that text between those tags is only
    processed when relevant (start_a and start_title set flags that let the
    next handle_data call know that it's up at bat for something).
    """
    def __init__(self, comic=None):
        """
        Instantiate a ToothpasteScraper for the given comic acronym
        ('mtts', 'tfd' or 'nd'). The current archive page for that comic
        is opened with urllib and feed() is called immediately with the
        resulting string.
        """
        # Empty comic list.
        self.comic_list = []
        # Title - uhhhh.
        self.comic_title = "(No comic title)"
        # Boolean state flags so that we know when to care about textual data
        self.parsing_comic_link = False
        self.parsing_title = False
        # Regular expression we'd like to match comic links against.
        self.URL_PATTERN = \
            r"http://www\.%s\.com/[01]\d{5}/([a-zA-Z_]|\-)+\.(gif|jpg)" % \
            (longnames[comic],)
        # Instantiate superclass.
        sgmllib.SGMLParser.__init__(self)
        self._go(comic)
    def _go(self, comic):
        """Starts the parser on data fetched from the comic's archive URL."""
        today = datetime.datetime.today()
        month = months[today.month - 1]
        year = str(today.year)[-2:]
        url = 'http://www.%s.com/%s-archives/%sarchive-%s%s.php' % \
            (longnames[comic], comic, comic, month, year)
        if DEBUG:
            print >> sys.stderr, "Fetching " + url
        handle = urllib.urlopen(url)
        data = handle.read()
        handle.close()
        self.feed(data)
        self.close()
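    # For illustration: with comic='tfd' in February 2010, _go would fetch
    # an archive URL of the form
    #     http://www.toothpastefordinner.com/tfd-archives/tfdarchive-feb10.php
    # (constructed purely from the format string above; the exact archive
    # layout is whatever the site used at the time).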
    def start_a(self, attributes):
        """
        Called when an opening anchor tag is encountered. Matches HREF
        against the comic-link pattern and, on a match, records the link
        and the date encoded in its path.
        """
        atts = dict(attributes)
        if 'href' in atts and re.match(self.URL_PATTERN, atts['href']):
            self.parsing_comic_link = True
            if DEBUG:
                print >> sys.stderr, atts['href'].split("/")
            # The fourth path component is the six-digit MMDDYY date directory.
            raw_d = atts['href'].split("/")[3]
            self.cur_date = "%s/%s/%s" % (raw_d[0:2], raw_d[2:4], raw_d[4:6])
            self.cur_link = atts['href']
    def handle_data(self, data):
        """
        Handles arbitrary data, but only actually does anything if we're
        inside a comic link or a title tag.
        """
        if self.parsing_comic_link:
            self.cur_text = data.strip() # Not sure if the strip is necessary.
        elif self.parsing_title:
            self.comic_title = "".join(data.strip().split("\n"))
    def end_a(self):
        """
        Called when a closing anchor tag is encountered. If a comic link is
        being read, it's finalized and added to the list. Otherwise nothing
        happens.
        """
        if self.parsing_comic_link:
            self.comic_list.append((self.cur_link, self.cur_date, self.cur_text))
            del self.cur_link, self.cur_date, self.cur_text
        # Reset unconditionally, outside the if, for robustness.
        self.parsing_comic_link = False
    def start_title(self, attributes):
        """Used to scrape the title of the comic from the TITLE tag."""
        self.parsing_title = True

    def end_title(self):
        """Used to scrape the title of the comic from the TITLE tag."""
        self.parsing_title = False
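# A rough usage sketch (values are illustrative, not real output):
#
#     scraper = ToothpasteScraper(comic='tfd')
#     scraper.comic_title   # contents of the page's <title> tag
#     scraper.comic_list    # e.g. [('http://www.toothpastefordinner.com/020110/some-comic.gif',
#                           #        '02/01/10', 'some comic'), ...]
#
# i.e. each entry is a (link, MM/DD/YY date, link text) tuple, which is the
# shape RSSMaker._load_comics below expects.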
class RSSMaker:
    def __init__(self, url, title, comics, desc=None):
        """
        Instantiates an RSSMaker, which needs a base URL and a list of
        comics to build itself a DOM tree. A title and a description are
        also kinda nice.
        """
        # Date format along the lines of RFC 822, as RSS 2.0 expects.
        self.date_format = "%a, %d %b %Y %H:%M:%S %Z"
        if url[-1] != "/":
            url += "/"
        self.url = url
        impl = minidom.getDOMImplementation()
        self.xmldocument = impl.createDocument(None, "rss", None)
        self.xmldocument.documentElement.setAttribute("version", "2.0")
        self.chan_tag = self.xmldocument.createElement("channel")
        self.xmldocument.documentElement.appendChild(self.chan_tag)
        titletag = self.xmldocument.createElement("title")
        if not title:
            title = "Scraped RSS Feed of " + url
        titletag.appendChild(self._mktxt(title))
        self.chan_tag.appendChild(titletag)
        linktag = self.xmldocument.createElement("link")
        linktag.appendChild(self._mktxt(url))
        self.chan_tag.appendChild(linktag)
        desctag = self.xmldocument.createElement("description")
        if not desc:
            desc = "No description provided."
        desctag.appendChild(self._mktxt(desc))
        self.chan_tag.appendChild(desctag)
        # Channel-level pubDate: the time the feed was generated.
        pubdatetag = self.xmldocument.createElement("pubDate")
        date = time.strftime(self.date_format)
        pubdatetag.appendChild(self._mktxt(date))
        self.chan_tag.appendChild(pubdatetag)
        if comics:
            self._load_comics(comics)
    def _mktxt(self, text):
        """Convenience method for creating text nodes."""
        return self.xmldocument.createTextNode(text)
    def _load_comics(self, comics):
        """
        Private method called by the constructor that actually loads in
        the comics and creates the relevant DOM elements.
        """
        for comic in comics:
            comic_url = comic[0]
            comic_item = self.xmldocument.createElement("item")
            # comic[1] is an MM/DD/YY string; turn it into a full date.
            date_nums = [int(x) for x in comic[1].split("/")]
            date_nums[-1] += 2000
            dt = datetime.datetime(date_nums[2], date_nums[0], date_nums[1])
            # Swap in the local DST flag so that strftime's %Z is sensible.
            tup = dt.timetuple()[:-1] + (time.daylight,)
            datestr = time.strftime(self.date_format, tup)
            comic_date = self.xmldocument.createElement("pubDate")
            comic_date.appendChild(self._mktxt(datestr))
            comic_title = self.xmldocument.createElement("title")
            comic_title.appendChild(self._mktxt(comic[2]))
            comic_link = self.xmldocument.createElement("link")
            comic_link.appendChild(self._mktxt(comic[0]))
            comic_desc = self.xmldocument.createElement("description")
            desc = "<img src=\"%s\" alt=\"%s\" />" % (comic[0], comic[2])
            comic_desc.appendChild(self._mktxt(desc))
            comic_item.appendChild(comic_title)
            comic_item.appendChild(comic_link)
            comic_item.appendChild(comic_desc)
            comic_item.appendChild(comic_date)
            comic_guid = self.xmldocument.createElement("guid")
            comic_guid.appendChild(self._mktxt(comic_url))
            comic_item.appendChild(comic_guid)
            self.chan_tag.appendChild(comic_item)
    def getfeed(self):
        """
        Retrieve the XML document in all of its glory. Presumably, if the
        constructor didn't bubble up an exception, this method should give
        you a valid RSS feed.
        """
        return self.xmldocument.toprettyxml()
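# Roughly speaking, getfeed() returns a pretty-printed document shaped like
# this (values are illustrative; minidom escapes the <img> markup inside
# the description text node):
#
#     <rss version="2.0">
#       <channel>
#         <title>...</title>
#         <link>http://www.toothpastefordinner.com/</link>
#         <description>No description provided.</description>
#         <pubDate>Mon, 01 Feb 2010 21:01:00 EST</pubDate>
#         <item>
#           <title>some comic</title>
#           <link>http://www.toothpastefordinner.com/020110/some-comic.gif</link>
#           <description>&lt;img src="..." alt="..." /&gt;</description>
#           <pubDate>Mon, 01 Feb 2010 00:00:00 EST</pubDate>
#           <guid>http://www.toothpastefordinner.com/020110/some-comic.gif</guid>
#         </item>
#       </channel>
#     </rss>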
def usage(argv):
    print >> sys.stderr, "usage: %s -c {mtts,tfd,nd}" % (argv[0],)
if __name__ == "__main__": | |
import getopt | |
opts, args = getopt.getopt(sys.argv[1:],'c:') | |
try: | |
d = dict(opts) | |
comic = d['-c'] | |
except: | |
usage(sys.argv) | |
sys.exit(1) | |
scraper = ToothpasteScraper(comic=comic) | |
f = RSSMaker("http://www.%s.com/" % (longnames[comic]), \ | |
title=scraper.comic_title, comics=scraper.comic_list) | |
print f.getfeed() |
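# Typical invocation (the feed is written to stdout, so redirect it):
#
#     python tfd.py -c tfd > tfd.rss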