Rimbo/fetchComic.py

## fetchComic.py
#!/usr/local/bin/python

from bs4 import BeautifulSoup
from urllib2 import urlopen
from urllib import urlretrieve
import argparse

class getComic(object):
    """Base class for comic scrapers.

    A subclass describes one web comic by setting the three class
    attributes below and overriding getImage() and getNext().  getImages()
    then walks the archive from the first page, following "next" links and
    saving one image per page under a relative file name.
    """

    # override these
    comicName = None    # prefix of the saved files: "<comicName>_00000.jpg"
    host = None         # URL template containing a single %s placeholder
    firstComic = None   # value substituted into host to build the first URL

    def getImage(self, pgsoup):
        """Return the URL of the comic image in the parsed page.

        Override this.  pgsoup is the BeautifulSoup object built by
        grabPage().  Return None when the page has no comic image.
        """
        return None

    def getNext(self, pgsoup):
        """Return the URL of the next page, or None on the last page.

        Override this.  The value is handed to urlopen() unchanged, and any
        falsy value ends the download loop in getImages().
        """
        return None

    # Don't override these

    def grabPage(self, url):
        """Fetch and parse url; return an (imageurl, nexturl) tuple."""
        pg = urlopen(url)
        try:
            pgsoup = BeautifulSoup(pg)
        finally:
            # Release the connection even if parsing raises.
            pg.close()
        imageurl = self.getImage(pgsoup)
        nexturl = self.getNext(pgsoup)
        return imageurl, nexturl

    def getImages(self):
        """Download every comic image, starting at firstComic.

        Files are named "<comicName>_<5-digit counter>.jpg".  The loop ends
        when getNext() yields a falsy value or a URL that has already been
        fetched during this run.
        """
        savename = self.comicName + "_%05d.jpg"
        count = 0
        nexturl = self.host % self.firstComic
        visited = set()

        # The visited set guards against a "next" link that leads back to a
        # page already fetched, which would otherwise loop forever.
        while nexturl and nexturl not in visited:
            pageurl = nexturl
            visited.add(pageurl)
            comic, nexturl = self.grabPage(pageurl)
            if not comic:
                # Nothing to download; keep following the chain instead of
                # passing None to urlretrieve().
                print("No image found at " + pageurl)
                continue
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Grabbing " + savename % count)
            urlretrieve(comic, savename % count)
            count += 1


class BattlePug(getComic):
    """Scraper for the BattlePug comic hosted at battlepug.com."""

    comicName = "BattlePug"
    host = "http://battlepug.com/%s"
    firstComic = "/comic/first"

    def getImage(self, pgsoup):
        """Return the value of the comic image's 'data-image' attribute.

        The image is looked up as the <img> inside the div with CSS class
        'page', itself inside the div with id 'comic'.
        """
        pgcomic = pgsoup.body.find('div', id='comic')
        pgpage = pgcomic.find('div', 'page')
        return pgpage.img['data-image']

    def getNext(self, pgsoup):
        """Return the URL of the next page, or None when there is none.

        The "next" link is taken to be the fourth <a> inside the div with
        id 'transport'; its href is substituted into the host template.
        """
        pgtrans = pgsoup.body.find('div', id='transport')
        pgnext = pgtrans.find_all('a')[3]  # next
        if not pgnext.has_attr('href'):
            # Without a target there is no further page.  Returning None,
            # instead of formatting None into the template (".../None"),
            # lets the download loop terminate.
            return None
        return self.host % pgnext['href']


class BuckGodot(getComic):
    """Scraper for Buck Godot hosted at airshipentertainment.com."""

    comicName = "Buck Godot"
    host = "http://www.airshipentertainment.com/%s"
    firstComic = "buckcomic.php?date=20070111"

    def getImage(self, pgsoup):
        """Return the src of the <img alt="Comic"> element.

        Returns None when the page has no such image or it has no src.
        """
        img = pgsoup.find('img', alt='Comic')
        if img is None:
            return None
        return img.get('src')

    def getNext(self, pgsoup):
        """Return the href of the link wrapping the "next" button image.

        Returns None when the page has no <img alt="The Next Comic"> or its
        parent carries no href, so that the last page ends the crawl instead
        of raising.  The href is returned exactly as found in the page.
        NOTE(review): presumably the site emits absolute URLs here - confirm.
        """
        marker = pgsoup.find('img', alt='The Next Comic')
        if marker is None:
            return None
        link = marker.parent
        if link is None or not link.has_attr('href'):
            return None
        return link['href']

class GirlGenius(getComic):
    """Scraper for Girl Genius hosted at girlgeniusonline.com."""

    comicName = "Girl Genius"
    host = "http://www.girlgeniusonline.com/%s"
    firstComic = "comic.php?date=20021104"

    def getImage(self, pgsoup):
        """Return the src of the <img alt="Comic"> element.

        Returns None when the page has no such image or it has no src.
        """
        img = pgsoup.find('img', alt='Comic')
        if img is None:
            return None
        return img.get('src')

    def getNext(self, pgsoup):
        """Return the href of the <a title="The Next Comic"> link.

        Returns None when the page has no such link (or it has no href), so
        that the last page ends the crawl instead of raising.  The href is
        returned exactly as found in the page.
        NOTE(review): presumably the site emits absolute URLs here - confirm.
        """
        link = pgsoup.find('a', title='The Next Comic')
        if link is None or not link.has_attr('href'):
            return None
        return link['href']


if __name__ == '__main__':

    # Map each scraper's comicName to its class so that only the requested
    # scraper is instantiated and unknown names are rejected by argparse
    # instead of silently doing nothing.
    comics = {cls.comicName: cls for cls in (BattlePug, BuckGodot, GirlGenius)}

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'comicName',
        type=str,
        choices=sorted(comics),
        help="the comic to download",
    )
    args = parser.parse_args()

    comics[args.comicName]().getImages()
	#!/usr/local/bin/python

	from bs4 import BeautifulSoup
	from urllib2 import urlopen
	from urllib import urlretrieve
	import argparse

	class getComic(object):
		"""Base class for comic scrapers.

		A subclass describes one web comic by setting the three class
		attributes below and overriding getImage() and getNext().
		getImages() then walks the archive from the first page, following
		"next" links and saving one image per page under a relative file
		name.
		"""

		# override these
		comicName = None    # prefix of the saved files: "<comicName>_00000.jpg"
		host = None         # URL template containing a single %s placeholder
		firstComic = None   # value substituted into host to build the first URL

		def getImage(self, pgsoup):
			"""Return the URL of the comic image in the parsed page.

			Override this.  pgsoup is the BeautifulSoup object built by
			grabPage().  Return None when the page has no comic image.
			"""
			return None

		def getNext(self, pgsoup):
			"""Return the URL of the next page, or None on the last page.

			Override this.  The value is handed to urlopen() unchanged, and
			any falsy value ends the download loop in getImages().
			"""
			return None

		# Don't override these

		def grabPage(self, url):
			"""Fetch and parse url; return an (imageurl, nexturl) tuple."""
			pg = urlopen(url)
			try:
				pgsoup = BeautifulSoup(pg)
			finally:
				# Release the connection even if parsing raises.
				pg.close()
			imageurl = self.getImage(pgsoup)
			nexturl = self.getNext(pgsoup)
			return imageurl, nexturl

		def getImages(self):
			"""Download every comic image, starting at firstComic.

			Files are named "<comicName>_<5-digit counter>.jpg".  The loop
			ends when getNext() yields a falsy value or a URL that has
			already been fetched during this run.
			"""
			savename = self.comicName + "_%05d.jpg"
			count = 0
			nexturl = self.host % self.firstComic
			visited = set()

			# The visited set guards against a "next" link that leads back
			# to a page already fetched, which would otherwise loop forever.
			while nexturl and nexturl not in visited:
				pageurl = nexturl
				visited.add(pageurl)
				comic, nexturl = self.grabPage(pageurl)
				if not comic:
					# Nothing to download; keep following the chain instead
					# of passing None to urlretrieve().
					print("No image found at " + pageurl)
					continue
				# Single-argument print(...) behaves the same on Python 2 and 3.
				print("Grabbing " + savename % count)
				urlretrieve(comic, savename % count)
				count += 1


	class BattlePug(getComic):
		"""Scraper for the BattlePug comic hosted at battlepug.com."""

		comicName = "BattlePug"
		host = "http://battlepug.com/%s"
		firstComic = "/comic/first"

		def getImage(self, pgsoup):
			"""Return the value of the comic image's 'data-image' attribute.

			The image is looked up as the <img> inside the div with CSS
			class 'page', itself inside the div with id 'comic'.
			"""
			pgcomic = pgsoup.body.find('div', id='comic')
			pgpage = pgcomic.find('div', 'page')
			return pgpage.img['data-image']

		def getNext(self, pgsoup):
			"""Return the URL of the next page, or None when there is none.

			The "next" link is taken to be the fourth <a> inside the div
			with id 'transport'; its href is substituted into the host
			template.
			"""
			pgtrans = pgsoup.body.find('div', id='transport')
			pgnext = pgtrans.find_all('a')[3]  # next
			if not pgnext.has_attr('href'):
				# Without a target there is no further page.  Returning
				# None, instead of formatting None into the template
				# (".../None"), lets the download loop terminate.
				return None
			return self.host % pgnext['href']


	class BuckGodot(getComic):
		"""Scraper for Buck Godot hosted at airshipentertainment.com."""

		comicName = "Buck Godot"
		host = "http://www.airshipentertainment.com/%s"
		firstComic = "buckcomic.php?date=20070111"

		def getImage(self, pgsoup):
			"""Return the src of the <img alt="Comic"> element.

			Returns None when the page has no such image or it has no src.
			"""
			img = pgsoup.find('img', alt='Comic')
			if img is None:
				return None
			return img.get('src')

		def getNext(self, pgsoup):
			"""Return the href of the link wrapping the "next" button image.

			Returns None when the page has no <img alt="The Next Comic"> or
			its parent carries no href, so that the last page ends the crawl
			instead of raising.  The href is returned exactly as found.
			NOTE(review): presumably the site emits absolute URLs - confirm.
			"""
			marker = pgsoup.find('img', alt='The Next Comic')
			if marker is None:
				return None
			link = marker.parent
			if link is None or not link.has_attr('href'):
				return None
			return link['href']

	class GirlGenius(getComic):
		"""Scraper for Girl Genius hosted at girlgeniusonline.com."""

		comicName = "Girl Genius"
		host = "http://www.girlgeniusonline.com/%s"
		firstComic = "comic.php?date=20021104"

		def getImage(self, pgsoup):
			"""Return the src of the <img alt="Comic"> element.

			Returns None when the page has no such image or it has no src.
			"""
			img = pgsoup.find('img', alt='Comic')
			if img is None:
				return None
			return img.get('src')

		def getNext(self, pgsoup):
			"""Return the href of the <a title="The Next Comic"> link.

			Returns None when the page has no such link (or it has no href),
			so that the last page ends the crawl instead of raising.  The
			href is returned exactly as found in the page.
			NOTE(review): presumably the site emits absolute URLs - confirm.
			"""
			link = pgsoup.find('a', title='The Next Comic')
			if link is None or not link.has_attr('href'):
				return None
			return link['href']


	if __name__ == '__main__':

		# Map each scraper's comicName to its class so that only the
		# requested scraper is instantiated and unknown names are rejected
		# by argparse instead of silently doing nothing.
		comics = {cls.comicName: cls for cls in (BattlePug, BuckGodot, GirlGenius)}

		parser = argparse.ArgumentParser()
		parser.add_argument(
			'comicName',
			type=str,
			choices=sorted(comics),
			help="the comic to download",
		)
		args = parser.parse_args()

		comics[args.comicName]().getImages()