Download all the images from an e-hentai /g/allery.
#! /usr/bin/env python
# Made by 'Mirko van der Waal'
# Distributed under terms of the MIT license.
try:
    from HTMLParser import HTMLParser
    from urllib2 import urlopen
    from re import search
    from os import getenv, mkdir
    from os.path import join as merge
    from sys import argv, exit
    from uuid import uuid4
    import getopt
except ImportError as e:
    print e
    exit(0)
URL = ''
IMAGE_URLS = []
IMAGES = []
PAGE_URLS = 40          # Images shown per gallery page.
TIMEOUT = 15            # Seconds before a download is abandoned.
PREFIX = 'image-'
FORMAT = '.jpg'
OUTPUT_REDUCTION = 32   # Truncate printed source urls to this length.
SILENT = False
TITLE = str(uuid4())[:6]
HOME_DIR = getenv('HOME')
OUT_DIR = merge(HOME_DIR, 'Pictures')
class GalleryParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        # For every attribute in attrs, test its value against our condition.
        for attr in attrs:
            try:
                # Does the value match the /s/ regex? /s/ hrefs point to
                # the pages that contain the images, as the images are
                # stored under this url path. Some other known paths are:
                # /tag/<tagname>, /uploader/<username>, /g/<id>/<id>
                if search('/s/', attr[1]).group(0):
                    # Store the url in an array of image pages to open later.
                    IMAGE_URLS.append(attr[1])
            # Most of the time search() returns None because there is no
            # match; .group() then raises, so we simply pass it off.
            except:
                pass
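# A sketch of what the parser collects, assuming typical gallery markup;
# the hash and ids below are hypothetical, not a real page:
#   <a href="http://g.e-hentai.org/s/0123abcd45/123456-1">...</a>
# would append 'http://g.e-hentai.org/s/0123abcd45/123456-1' to IMAGE_URLS.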
# Parse the command line with the standard getopt module.
try:
    opts, args = getopt.getopt(argv[1:],
        'hu:n:p:x:f:o:t:i:s',
        ['url=', 'name=', 'path=', 'prefix=', 'format=',
         'output-reduction=', 'timeout=', 'image-limit=', 'silent', 'help'])
except Exception as e:
    print e
    exit(0)
for o, a in opts:
    if o in ('-u', '--url'): URL = a
    elif o in ('-n', '--name'): TITLE = a
    elif o in ('-p', '--path'): OUT_DIR = a
    elif o in ('-i', '--image-limit'): PAGE_URLS = a
    elif o in ('-t', '--timeout'): TIMEOUT = a
    elif o in ('-x', '--prefix'): PREFIX = a
    elif o in ('-f', '--format'): FORMAT = a
    elif o in ('-s', '--silent'): SILENT = True
    elif o in ('-o', '--output-reduction'): OUTPUT_REDUCTION = a
    elif o in ('-h', '--help'):
        print """
        -u, --url <value>
            The gallery you want to download; make sure to supply a full url.
            This is the one and only required parameter.
            [Default: input prompt]
        -n, --name <value>
            The folder name to extract the content into.
            [Default: a 6-character UUID4 string]
        -p, --path <value>
            The path to extract the images to.
            [Default: ~/Pictures|$HOME/Pictures]
        -x, --prefix <value>
            The downloaded image prefix.
            [Default: image-<index>]
        -f, --format <value>
            The image format to save as.
            [Default: .jpg]
        -o, --output-reduction <value>
            Reduce the length of the original source url when printing progress.
            [Default: 32]
        -t, --timeout <value>
            Set the timeout for when a download takes too long.
            This may occur with unstable hosts.
            [Default: 15]
        -i, --image-limit <value>
            The number of images per gallery page. Do not change this
            unless you used Hath to change the row count.
            [Default: 40]
        -s, --silent
            Silence all output while downloading.
            [Default: False]
        -h, --help
            This text.
        """
        exit(0)
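# Example invocation, assuming the script is saved as ehentai.py and
# using a hypothetical gallery url:
#   python ehentai.py -u http://g.e-hentai.org/g/123456/0123abcd45/ -n doujin -s
# Only -u/--url is required; everything else falls back to the defaults above.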
# Create the parser.
Parser = GalleryParser()
# Fallback for a missing -u parameter.
if URL == '': URL = raw_input('Gallery: ')
# The final directory where this batch of images is going to be saved.
MERGED = merge(OUT_DIR, TITLE)
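# With the defaults this resolves to something like ~/Pictures/a1b2c3,
# where the 6-character suffix is the random TITLE, so yours will differ.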
# Because bigger galleries exist, we have to visit every page.
# We first read the total image count and divide it by the images per
# page, then use that page count to rewrite the url and visit each page.
if not SILENT: print '[..] Obtaining pages'
PAGES = int(search('\d+ @', str(urlopen(URL).read())).group(0).replace(' ', '').replace('@', '')) // int(PAGE_URLS)
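# Worked example, assuming the gallery page reports its size in the form
# '<count> @ <bytes>' (as the regex above expects): a page containing
# '123 @ 45.67 MB' yields 123 // 40 == 3, so pages ?p=0 .. ?p=3 are fetched.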
for i in range(PAGES + 1):
    try:
        Parser.feed(urlopen(URL + '?p=%s' % str(i)).read())
    except Exception as e:
        print e
        exit(0)
if not SILENT: print '[OK] Success'
# We now have the unique url paths, but e-hentai uses an external hosting
# system, where users are encouraged to host images for them in exchange
# for various benefits. This means we have to open every /s/ page and dig
# the real source url out of it.
if not SILENT: print '[..] Splitting images from their container'
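# A sketch of what the regex below pulls out of each /s/ page; the
# address and path here are hypothetical, not a real host:
#   http://12.34.56.78/h/0123abcd.../keystamp=.../image.jpg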
for U in IMAGE_URLS:
    u = search('http:\/\/\d+\.\d+\.\d+\.\d+[^"]+', urlopen(U).read())
    IMAGES.append(u.group(0))
if not SILENT: print '[OK] Success'
if not SILENT: print '[..] Making directory (%s)' % TITLE
try:
    mkdir(MERGED)
except Exception as e:
    print e
    exit(0)
if not SILENT: print '[OK] Success'
for ind, image in enumerate(IMAGES):
    # Build the target filename once so the printed name and the saved
    # name are guaranteed to match.
    name = merge(MERGED, PREFIX + str(ind + 1) + FORMAT)
    if not SILENT:
        print '%i/%s\t%s\t%s' % (ind + 1, len(IMAGES), image[:int(OUTPUT_REDUCTION)], name)
    with open(name, "wb") as output:
        try:
            output.write(urlopen(image, timeout=int(TIMEOUT)).read())
        except Exception:
            if not SILENT: print 'An unstable host caused a timeout (%i/%s).' % (ind + 1, len(IMAGES))
if not SILENT: print '[OK] Finished successfully'