Created
August 5, 2010 02:48
-
-
Save xim/509152 to your computer and use it in GitHub Desktop.
Manga Stream "direct download" scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Module docstring below doubles as the OptionParser usage text (see __main__).
"""%prog [-d] Manga [chapter]

NOTE: Does not work any more!
I probably won't fix it. Ever.
Patches accepted =)

Quick and dirty: Manga Stream direct download hack.
Fetches a named comic matching the regex given on cmd line. E.g:

%prog -d Naruto 505   # Creates Naruto_505.zip with corresponding ch."""
import logging | |
from optparse import OptionParser | |
import os | |
import re | |
import shutil | |
import socket | |
import sys | |
import tempfile | |
import urllib2 | |
from zipfile import ZipFile | |
from BeautifulSoup import BeautifulSoup | |
import mechanize | |
# Hack: set a global 5-second socket timeout so a slow/overloaded
# mangastream server makes urlopen raise URLError instead of hanging,
# which lets MangaStreamScraper.open() retry the fetch.
socket.setdefaulttimeout(5)
class MangaStreamScraper(mechanize.Browser): | |
""" MangaStream scraper main (only) class. """ | |
def __init__(self, search, url='http://mangastream.com'): | |
if not search: | |
raise TypeError("Please provide a search regex on cmd line") | |
mechanize.Browser.__init__(self) | |
self.search = search | |
self.url = url | |
self.zipname = search.replace(' ', '_') + '.zip' | |
self.tmp = tempfile.mkdtemp() | |
logging.debug('Made tmp dir:') | |
logging.debug(' ' + self.tmp) | |
# Haxx, ojess I loves! | |
self.set_handle_robots(False) | |
self.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8')] | |
def _get_page_urls(self): | |
""" Get the URLs of the pages from the first manga page """ | |
self.open(self.url) | |
try: | |
self.follow_link(text_regex=re.compile(self.search)) | |
except mechanize.LinkNotFoundError: | |
logging.critical('No link matching "%s" found on %s' % | |
(self.search, self.url)) | |
os.rmdir(self.tmp) | |
exit(1) | |
logging.debug('Got manga - ' + self.geturl()) | |
soup = BeautifulSoup(self._response.read()) | |
# TODO: Ugly anti-anti-scrape. Fix later :p | |
selects = soup.find('div', {'id': 'page'}).findAll('select') | |
page_select = None | |
for select in selects: | |
option = select.find('option', selected="selected") | |
if option.has_key('selected'): | |
try: | |
if int(option.string) < 2: | |
page_select = select | |
except ValueError: | |
continue | |
return [str(option['value']) for option in page_select or select if \ | |
hasattr(option, 'has_key') and \ | |
option.has_key('value') and \ | |
option['value']] | |
def download_and_zip(self): | |
""" Get the manga specified in search, download, zip """ | |
pages = self._get_page_urls() | |
zipf = ZipFile(os.path.join(self.tmp, self.zipname), 'w') | |
for page in pages: | |
logging.debug('Page: ' + page) | |
self.open(self.url + page) | |
soup = BeautifulSoup(self._response.read()) | |
# TODO: The silly anti-scrape makes forces me to write silly code? | |
imgs = soup.find('div', {'id': 'page'}).findAll('img') | |
for img in imgs: | |
if img.has_key('height'): | |
break | |
img = img['src'] | |
logging.info(" downloading %s" % img) | |
zipf.writestr(img.split('/')[-1], self.open(img).read()) | |
del soup | |
zipf.close() | |
self._cleanup() | |
def open(self, url, *args, **kwargs): | |
""" We want a more robust open() than Browser.open() | |
This one handles (a more or less reasonable amount of) timeouts. | |
""" | |
attempt = 0 | |
while attempt < 25: | |
try: | |
return mechanize.Browser.open(self, url, *args, **kwargs) | |
except urllib2.URLError: | |
logging.debug('Timed out getting %r, retrying' % url) | |
raise urllib2.URLError('Timed out %d times on %r, giving up' % (attempt, url)) | |
def _cleanup(self): | |
""" Remove tmp dir and tell the user where the file is """ | |
print "Go get it:" | |
if not os.path.exists(self.zipname): | |
shutil.move(os.path.join(self.tmp, self.zipname), '.') | |
os.rmdir(self.tmp) | |
print ' ' + os.path.join('.', self.zipname) | |
else: | |
print ' ' + os.path.join(self.tmp, self.zipname) | |
if __name__ == '__main__':
    # The module docstring doubles as the usage text.
    parser = OptionParser(usage=__doc__)
    parser.add_option('-d', '--debug', action='store_const', dest='loglevel',
                      const=logging.DEBUG, default=logging.INFO,
                      help='Debug verbosity')
    options, args = parser.parse_args()
    # Verbose log lines (with level name) only when debugging.
    log_format = ("[%(levelname)8s %(asctime)s] %(message)s"
                  if options.loglevel == logging.DEBUG
                  else "%(asctime)s: %(message)s")
    logging.basicConfig(level=options.loglevel, format=log_format)
    # Everything after the options is the search regex.
    MangaStreamScraper(' '.join(args)).download_and_zip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment