Download all the images from an e-hentai /g/allery.
#! /usr/bin/env python
# Made by 'Mirko van der Waal'
# Distributed under terms of the MIT license.
try:
    from HTMLParser import HTMLParser
    from urllib2 import urlopen
    from re import search
    from os import getenv, mkdir
    from os.path import join as merge
    from sys import argv, exit
    from uuid import uuid4
    import getopt
except ImportError as e:
    print e
    exit(0)
URL = ''
IMAGE_URLS = []
IMAGES = []
PAGE_URLS = 40          # Images shown per gallery page.
TIMEOUT = 15            # Seconds before a download is abandoned.
PREFIX = 'image-'
FORMAT = '.jpg'
OUTPUT_REDUCTION = 32   # Truncate printed source urls to this length.
SILENT = False
TITLE = str(uuid4())[:6]
HOME_DIR = getenv('HOME')
OUT_DIR = merge(HOME_DIR, 'Pictures')
class GalleryParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        # For every attribute in attrs, test its value against our condition.
        for attr in attrs:
            try:
                # Does the value match the /s/ regex? /s/ hrefs point to
                # the pages that contain the images, as the images are
                # stored under this url path. Some other known paths are:
                # /tag/<tagname>, /uploader/<username>, /g/<id>/<id>
                if search('/s/', attr[1]).group(0):
                    # Store the url in an array of image pages to open later.
                    IMAGE_URLS.append(attr[1])
            # Most of the time search() returns None because there is no
            # match; .group() then raises, so we simply pass it off.
            except:
                pass
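# A sketch of what the parser collects, assuming typical gallery markup;
# the hash and ids below are hypothetical, not a real page:
#   <a href="http://g.e-hentai.org/s/0123abcd45/123456-1">...</a>
# would append 'http://g.e-hentai.org/s/0123abcd45/123456-1' to IMAGE_URLS.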
# Parse the command line with the standard getopt module.
try:
    opts, args = getopt.getopt(argv[1:],
        'hu:n:p:x:f:o:t:i:s',
        ['url=', 'name=', 'path=', 'prefix=', 'format=',
         'output-reduction=', 'timeout=', 'image-limit=', 'silent', 'help'])
except Exception as e:
    print e
    exit(0)
for o, a in opts:
    if o in ('-u', '--url'): URL = a
    elif o in ('-n', '--name'): TITLE = a
    elif o in ('-p', '--path'): OUT_DIR = a
    elif o in ('-i', '--image-limit'): PAGE_URLS = a
    elif o in ('-t', '--timeout'): TIMEOUT = a
    elif o in ('-x', '--prefix'): PREFIX = a
    elif o in ('-f', '--format'): FORMAT = a
    elif o in ('-s', '--silent'): SILENT = True
    elif o in ('-o', '--output-reduction'): OUTPUT_REDUCTION = a
    elif o in ('-h', '--help'):
        print """
        -u, --url <value>
            The gallery you want to download; make sure to supply a full url.
            This is the one and only required parameter.
            [Default: input prompt]
        -n, --name <value>
            The folder name to extract the content into.
            [Default: a 6-character UUID4 string]
        -p, --path <value>
            The path to extract the images to.
            [Default: ~/Pictures|$HOME/Pictures]
        -x, --prefix <value>
            The downloaded image prefix.
            [Default: image-<index>]
        -f, --format <value>
            The image format to save as.
            [Default: .jpg]
        -o, --output-reduction <value>
            Reduce the length of the original source url when printing progress.
            [Default: 32]
        -t, --timeout <value>
            Set the timeout for when a download takes too long.
            This may occur with unstable hosts.
            [Default: 15]
        -i, --image-limit <value>
            The number of images per gallery page. Do not change this
            unless you used Hath to change the row count.
            [Default: 40]
        -s, --silent
            Silence all output while downloading.
            [Default: False]
        -h, --help
            This text.
        """
        exit(0)
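# Example invocation, assuming the script is saved as ehentai.py and
# using a hypothetical gallery url:
#   python ehentai.py -u http://g.e-hentai.org/g/123456/0123abcd45/ -n doujin -s
# Only -u/--url is required; everything else falls back to the defaults above.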
# Create the parser.
Parser = GalleryParser()
# Fallback for a missing -u parameter.
if URL == '': URL = raw_input('Gallery: ')
# The final directory where this batch of images is going to be saved.
MERGED = merge(OUT_DIR, TITLE)
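# With the defaults this resolves to something like ~/Pictures/a1b2c3,
# where the 6-character suffix is the random TITLE, so yours will differ.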
# Because bigger galleries exist, we have to visit every page.
# We first read the total image count and divide it by the images per
# page, then use that page count to rewrite the url and visit each page.
if not SILENT: print '[..] Obtaining pages'
PAGES = int(search('\d+ @', str(urlopen(URL).read())).group(0).replace(' ', '').replace('@', '')) // int(PAGE_URLS)
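# Worked example, assuming the gallery page reports its size in the form
# '<count> @ <bytes>' (as the regex above expects): a page containing
# '123 @ 45.67 MB' yields 123 // 40 == 3, so pages ?p=0 .. ?p=3 are fetched.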
for i in range(PAGES + 1):
    try:
        Parser.feed(urlopen(URL + '?p=%s' % str(i)).read())
    except Exception as e:
        print e
        exit(0)
if not SILENT: print '[OK] Success'
# We now have the unique url paths, but e-hentai uses an external hosting
# system, where users are encouraged to host images for them in exchange
# for various benefits. This means we have to open every /s/ page and dig
# the real source url out of it.
if not SILENT: print '[..] Splitting images from their container'
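# A sketch of what the regex below pulls out of each /s/ page; the
# address and path here are hypothetical, not a real host:
#   http://12.34.56.78/h/0123abcd.../keystamp=.../image.jpg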
for U in IMAGE_URLS:
    u = search('http:\/\/\d+\.\d+\.\d+\.\d+[^"]+', urlopen(U).read())
    IMAGES.append(u.group(0))
if not SILENT: print '[OK] Success'
if not SILENT: print '[..] Making directory (%s)' % TITLE
try:
    mkdir(MERGED)
except Exception as e:
    print e
    exit(0)
if not SILENT: print '[OK] Success'
for ind, image in enumerate(IMAGES):
    # Build the target filename once so the printed name and the saved
    # name are guaranteed to match.
    name = merge(MERGED, PREFIX + str(ind + 1) + FORMAT)
    if not SILENT:
        print '%i/%s\t%s\t%s' % (ind + 1, len(IMAGES), image[:int(OUTPUT_REDUCTION)], name)
    with open(name, "wb") as output:
        try:
            output.write(urlopen(image, timeout=int(TIMEOUT)).read())
        except Exception:
            if not SILENT: print 'An unstable host caused a timeout (%i/%s).' % (ind + 1, len(IMAGES))
if not SILENT: print '[OK] Finished successfully'