@xim
Created August 5, 2010 02:48
Manga Stream “direct download” by scraper
#!/usr/bin/env python
"""%prog [-d] Manga [chapter]
NOTE: Does not work any more!
I probably won't fix it. Ever.
Patches accepted =)
Quick and dirty: Manga Stream direct download hack.
Fetches a named comic matching the regex given on the command line, e.g.:
%prog -d Naruto 505 # Creates Naruto_505.zip with the corresponding chapter"""
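# Note: targets Python 2 and the 2010-era third-party libraries imported
# below, mechanize and BeautifulSoup 3 (the old "from BeautifulSoup import
# BeautifulSoup" API, not bs4).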
import logging
from optparse import OptionParser
import os
import re
import shutil
import socket
import sys
import tempfile
import urllib2
from zipfile import ZipFile
from BeautifulSoup import BeautifulSoup
import mechanize
# Hack: set a default timeout of 5 seconds, so that open() below can give up
# and retry when the mangastream server is under heavy load.
socket.setdefaulttimeout(5)


class MangaStreamScraper(mechanize.Browser):
    """ MangaStream scraper main (only) class. """

    def __init__(self, search, url='http://mangastream.com'):
        if not search:
            raise TypeError("Please provide a search regex on cmd line")
        mechanize.Browser.__init__(self)
        self.search = search
        self.url = url
        self.zipname = search.replace(' ', '_') + '.zip'
        self.tmp = tempfile.mkdtemp()
        logging.debug('Made tmp dir: %s' % self.tmp)
        # Pretend to be a regular Firefox and ignore robots.txt, so
        # mangastream serves us pages like it would any other browser.
        self.set_handle_robots(False)
        self.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; '
                            'en-US; rv:1.9.2.8) Gecko/20100723 '
                            'Ubuntu/10.04 (lucid) Firefox/3.6.8')]

    def _get_page_urls(self):
        """ Get the URLs of the pages from the first manga page """
        self.open(self.url)
        try:
            self.follow_link(text_regex=re.compile(self.search))
        except mechanize.LinkNotFoundError:
            logging.critical('No link matching "%s" found on %s' %
                             (self.search, self.url))
            os.rmdir(self.tmp)
            exit(1)
        logging.debug('Got manga - ' + self.geturl())
        soup = BeautifulSoup(self._response.read())
        # TODO: Ugly anti-anti-scrape. Fix later :p
        # The chapter page carries several <select> drop-downs; the page
        # selector is the one whose currently selected option is page 1.
        selects = soup.find('div', {'id': 'page'}).findAll('select')
        page_select = None
        for select in selects:
            option = select.find('option', selected="selected")
            if option is None:
                continue
            try:
                if int(option.string) < 2:
                    page_select = select
            except ValueError:
                continue
        # Fall back to the last <select> seen if none matched; the hasattr()
        # check skips NavigableStrings (whitespace) between the <option> tags.
        return [str(option['value']) for option in page_select or select
                if hasattr(option, 'has_key') and
                option.has_key('value') and
                option['value']]

    def download_and_zip(self):
        """ Get the manga specified in search, download, zip """
        pages = self._get_page_urls()
        zipf = ZipFile(os.path.join(self.tmp, self.zipname), 'w')
        for page in pages:
            logging.debug('Page: ' + page)
            self.open(self.url + page)
            soup = BeautifulSoup(self._response.read())
            # TODO: The silly anti-scrape forces me to write silly code.
            # The actual manga image is the only <img> with a height attribute.
            imgs = soup.find('div', {'id': 'page'}).findAll('img')
            for img in imgs:
                if img.has_key('height'):
                    break
            img = img['src']
            logging.info(" downloading %s" % img)
            zipf.writestr(img.split('/')[-1], self.open(img).read())
            del soup
        zipf.close()
        self._cleanup()

    def open(self, url, *args, **kwargs):
        """ We want a more robust open() than Browser.open()
        This one handles (a more or less reasonable amount of) timeouts.
        """
        attempt = 0
        while attempt < 25:
            attempt += 1
            try:
                return mechanize.Browser.open(self, url, *args, **kwargs)
            except urllib2.URLError:
                logging.debug('Timed out getting %r, retrying' % url)
        raise urllib2.URLError('Timed out %d times on %r, giving up' %
                               (attempt, url))

    def _cleanup(self):
        """ Remove tmp dir and tell the user where the file is """
        print "Go get it:"
        if not os.path.exists(self.zipname):
            # Nothing with that name in the working dir; move the zip here.
            shutil.move(os.path.join(self.tmp, self.zipname), '.')
            os.rmdir(self.tmp)
            print ' ' + os.path.join('.', self.zipname)
        else:
            # Don't clobber an existing file; leave the zip in the tmp dir.
            print ' ' + os.path.join(self.tmp, self.zipname)


if __name__ == '__main__':
    parser = OptionParser(usage=__doc__)
    parser.add_option('-d', '--debug', action='store_const', dest='loglevel',
                      const=logging.DEBUG, default=logging.INFO,
                      help='Debug verbosity')
    options, args = parser.parse_args()
    if options.loglevel == logging.DEBUG:
        LOG_FORMAT = "[%(levelname)8s %(asctime)s] %(message)s"
    else:
        LOG_FORMAT = "%(asctime)s: %(message)s"
    logging.basicConfig(level=options.loglevel, format=LOG_FORMAT)
    scraper = MangaStreamScraper(' '.join(args))
    scraper.download_and_zip()
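
For reference, the scraper can also be driven from another Python 2 script
rather than the command line; a minimal sketch, assuming the gist file is
saved as mangastream.py on the import path (that file name is my assumption):

    from mangastream import MangaStreamScraper

    # Same search regex as on the command line; must match a front-page link.
    scraper = MangaStreamScraper('Naruto 505')
    scraper.download_and_zip()  # writes Naruto_505.zip and prints its path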