@xim
Created August 5, 2010 02:48
Manga Stream “direct download” by scraper
#!/usr/bin/env python
"""%prog [-d] Manga [chapter]
NOTE: Does not work any more!
I probably won't fix it. Ever.
Patches accepted =)
Quick and dirty: Manga Stream direct download hack.
Fetches a named comic matching the regex given on the command line, e.g.:
%prog -d Naruto 505 # Creates Naruto_505.zip with the corresponding chapter"""
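# Note: targets Python 2 and the 2010-era third-party libraries imported
# below, mechanize and BeautifulSoup 3 (the old "from BeautifulSoup import
# BeautifulSoup" API, not bs4).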
import logging
from optparse import OptionParser
import os
import re
import shutil
import socket
import sys
import tempfile
import urllib2
from zipfile import ZipFile
from BeautifulSoup import BeautifulSoup
import mechanize
# Hack: set a default timeout of 5 seconds, so that open() below can give up
# and retry when the mangastream server is under heavy load.
socket.setdefaulttimeout(5)


class MangaStreamScraper(mechanize.Browser):
    """ MangaStream scraper main (only) class. """

    def __init__(self, search, url='http://mangastream.com'):
        if not search:
            raise TypeError("Please provide a search regex on cmd line")
        mechanize.Browser.__init__(self)
        self.search = search
        self.url = url
        self.zipname = search.replace(' ', '_') + '.zip'
        self.tmp = tempfile.mkdtemp()
        logging.debug('Made tmp dir: %s' % self.tmp)
        # Pretend to be a regular Firefox and ignore robots.txt, so
        # mangastream serves us pages like it would any other browser.
        self.set_handle_robots(False)
        self.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; '
                            'en-US; rv:1.9.2.8) Gecko/20100723 '
                            'Ubuntu/10.04 (lucid) Firefox/3.6.8')]

    def _get_page_urls(self):
        """ Get the URLs of the pages from the first manga page """
        self.open(self.url)
        try:
            self.follow_link(text_regex=re.compile(self.search))
        except mechanize.LinkNotFoundError:
            logging.critical('No link matching "%s" found on %s' %
                             (self.search, self.url))
            os.rmdir(self.tmp)
            exit(1)
        logging.debug('Got manga - ' + self.geturl())
        soup = BeautifulSoup(self._response.read())
        # TODO: Ugly anti-anti-scrape. Fix later :p
        # The chapter page carries several <select> drop-downs; the page
        # selector is the one whose currently selected option is page 1.
        selects = soup.find('div', {'id': 'page'}).findAll('select')
        page_select = None
        for select in selects:
            option = select.find('option', selected="selected")
            if option is None:
                continue
            try:
                if int(option.string) < 2:
                    page_select = select
            except ValueError:
                continue
        # Fall back to the last <select> seen if none matched; the hasattr()
        # check skips NavigableStrings (whitespace) between the <option> tags.
        return [str(option['value']) for option in page_select or select
                if hasattr(option, 'has_key') and
                option.has_key('value') and
                option['value']]

    def download_and_zip(self):
        """ Get the manga specified in search, download, zip """
        pages = self._get_page_urls()
        zipf = ZipFile(os.path.join(self.tmp, self.zipname), 'w')
        for page in pages:
            logging.debug('Page: ' + page)
            self.open(self.url + page)
            soup = BeautifulSoup(self._response.read())
            # TODO: The silly anti-scrape forces me to write silly code.
            # The actual manga image is the only <img> with a height attribute.
            imgs = soup.find('div', {'id': 'page'}).findAll('img')
            for img in imgs:
                if img.has_key('height'):
                    break
            img = img['src']
            logging.info(" downloading %s" % img)
            zipf.writestr(img.split('/')[-1], self.open(img).read())
            del soup
        zipf.close()
        self._cleanup()

    def open(self, url, *args, **kwargs):
        """ We want a more robust open() than Browser.open()
        This one handles (a more or less reasonable amount of) timeouts.
        """
        attempt = 0
        while attempt < 25:
            attempt += 1
            try:
                return mechanize.Browser.open(self, url, *args, **kwargs)
            except urllib2.URLError:
                logging.debug('Timed out getting %r, retrying' % url)
        raise urllib2.URLError('Timed out %d times on %r, giving up' %
                               (attempt, url))

    def _cleanup(self):
        """ Remove tmp dir and tell the user where the file is """
        print "Go get it:"
        if not os.path.exists(self.zipname):
            # Nothing with that name in the working dir; move the zip here.
            shutil.move(os.path.join(self.tmp, self.zipname), '.')
            os.rmdir(self.tmp)
            print ' ' + os.path.join('.', self.zipname)
        else:
            # Don't clobber an existing file; leave the zip in the tmp dir.
            print ' ' + os.path.join(self.tmp, self.zipname)


if __name__ == '__main__':
    parser = OptionParser(usage=__doc__)
    parser.add_option('-d', '--debug', action='store_const', dest='loglevel',
                      const=logging.DEBUG, default=logging.INFO,
                      help='Debug verbosity')
    options, args = parser.parse_args()
    if options.loglevel == logging.DEBUG:
        LOG_FORMAT = "[%(levelname)8s %(asctime)s] %(message)s"
    else:
        LOG_FORMAT = "%(asctime)s: %(message)s"
    logging.basicConfig(level=options.loglevel, format=LOG_FORMAT)
    scraper = MangaStreamScraper(' '.join(args))
    scraper.download_and_zip()
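
For reference, the scraper can also be driven from another Python 2 script
rather than the command line; a minimal sketch, assuming the gist file is
saved as mangastream.py on the import path (that file name is my assumption):

    from mangastream import MangaStreamScraper

    # Same search regex as on the command line; must match a front-page link.
    scraper = MangaStreamScraper('Naruto 505')
    scraper.download_and_zip()  # writes Naruto_505.zip and prints its path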