e621 pool downloader, Python 2
# DO WHAT THE HECK YOU WANT TO PUBLIC LICENSE
# Version 4, October 2019
#
# Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
# Copyright (C) 2013 Ben McGinnes <ben@adversary.org>
# Copyright (C) 2019 Hexawolf <hexawolfie@gmail.com>
#
# Everyone is permitted to copy and distribute verbatim or
# modified copies of this license document, and changing it is
# allowed as long as the name is changed.
#
# DO WHAT THE HECK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
#
# 0. You just do anything you want to with this work.
#
# 1. Do not hold the author(s), creator(s), developer(s) or
# distributor(s) liable for anything that happens with your
# use of the work.
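# Usage: python pool-downloader.py POOL_ID
# (the script filename above is illustrative; pass the numeric e621 pool
#  ID as the first argument)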
from __future__ import print_function
import os
import sys
import re
import base64
import socket
import urllib2
import xml.dom.minidom
import string
import hashlib
import math
import traceback
import unicodedata
__version__ = '0.1'
if len(sys.argv) < 2:
    print("You must provide a pool ID")
    sys.exit(1)
poolID = sys.argv[1]
post_index = 1
downloaded = 0
socket.setdefaulttimeout(15)
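# Cookie-aware opener with a browser-like User-Agent, used for all requests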
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
opener.addheaders = [('User-agent', 'Firefox')]
urllib2.install_opener(opener)
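# Maximum number of attempts for each network operation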
c_retries = 5
poolName = ''
postLimit = 24.0 # How many posts are displayed per page
# Search for the pool
poolURL = 'http://e621.net/pool/show.xml?id=%s' % (poolID)

def checkMD5(filename, realMD5):
    """Calculates the MD5 of the file and compares it with realMD5."""
    # Hash the file's contents (reads the whole file into memory)
    with open(filename, 'rb') as f:
        fileMD5 = hashlib.md5(f.read()).hexdigest()
    if fileMD5 != realMD5:
        print("md5 hashes do not match. Download failed or file is corrupt.")
        print(str(fileMD5) + ", " + str(realMD5))
        return False
    return True

def get_valid_filename(s):
    """Normalizes a string into a safe ASCII filename."""
    value = unicodedata.normalize('NFKD', s)
    value = value.encode('ascii', 'ignore')
    value = unicode(value.strip().replace(' ', '_'))
    return unicode(re.sub(r'(?u)[^-\w.]', '', value))

def get_hash_string(s):
    """Derives a short, filename-safe string from s (truncated urlsafe base64)."""
    encoded = base64.urlsafe_b64encode(s.encode('utf-8'))
    return unicode(encoded[:10])  # truncate to keep filenames short

def downloadPosts(posts):
    """Looks for the file URL in each post and downloads it."""
    global poolName
    global c_retries
    global post_index
    destinationFolder = r'.'
    previousDir = os.getcwd()
    # Create the download folder if it does not exist, and move into it
    if not os.path.exists(destinationFolder):
        os.mkdir(destinationFolder)
    os.chdir(destinationFolder)
    poolFolder = os.path.join(os.getcwd(), poolName)
    if not os.path.exists(poolFolder):
        os.mkdir(poolFolder)
    os.chdir(poolFolder)
    # Download each file, retrying failures up to c_retries times
    for post in posts:
        print("")
        try:
            for trial in range(0, c_retries):
                try:
                    getFile(post)
                    # Success: skip the remaining trials, go to the next post
                    break
                except KeyboardInterrupt:
                    raise
                except Exception:
                    print("Download and/or save failed")
                    if trial + 1 < c_retries:
                        print('Re-attempting download. Re-attempt # {t} of '
                              '{retries}.\n'.format(t=(trial + 1),
                                                    retries=(c_retries - 1)))
                        continue
                    raise
        except KeyboardInterrupt:
            raise
        except Exception:
            print("Cannot download file")
            print(traceback.format_exc())
        finally:
            post_index += 1
    os.chdir(previousDir)

def getFile(post, force_download=False):
    """Gets the file from the post."""
    global poolName
    global post_index
    global downloaded
    # Pull the fields we need out of the post's XML node
    url = post.getElementsByTagName('file_url').item(0).firstChild.nodeValue
    md5 = post.getElementsByTagName('md5').item(0).firstChild.nodeValue
    rating = post.getElementsByTagName('rating').item(0).firstChild.nodeValue
    id = post.getElementsByTagName('id').item(0).firstChild.nodeValue
    tags = post.getElementsByTagName('tags').item(0).firstChild.nodeValue
    width = post.getElementsByTagName('width').item(0).firstChild.nodeValue
    height = post.getElementsByTagName('height').item(0).firstChild.nodeValue
    file_size = int(post.getElementsByTagName('file_size')
                    .item(0).firstChild.nodeValue)
    if rating == 's':
        rating = 'safe'
    elif rating == 'q':
        rating = 'questionable'
    else:
        rating = 'explicit'
    # Build the local filename: pool name plus the post's position in the pool
    temp = string.Template(r'${name}_${pos}')
    name = temp.substitute(pos=post_index, id=id, md5=md5, tags=tags,
                           rating=rating, w=width, h=height, name=poolName)
    extension = os.path.splitext(url)[1]
    fullName = get_valid_filename(u''.join([name, extension]))
    if file_size > 1024:
        file_size = '%d KB' % (file_size / 1024)
    else:
        file_size = '%d B' % file_size
    print("Downloading " + url + ' (' + file_size + ')...')
    if not force_download and os.path.exists(fullName):
        msg = "file already exists"
        if not checkMD5(fullName, md5):
            # Local copy is corrupt or incomplete: download it again
            print(msg + ", but md5 hashes do not match. Re-downloading.")
        else:
            print(msg + ". Skipping")
            return
    print("Saving as " + fullName)
    outFile = open(fullName, 'wb')
    try:
        data = urllib2.urlopen(url).read()
        outFile.write(data)
    finally:
        outFile.close()
    if not checkMD5(fullName, md5):
        raise Exception("md5 hashes do not match")
    downloaded += 1
    print('Finished.')

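# Main script: fetch the pool's metadata, then download every page of posts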
connectionSuccess = False
attempt = 1
while attempt <= c_retries:
    try:
        results = urllib2.urlopen(poolURL)
        connectionSuccess = True
        break
    except Exception as err:
        print('could not connect to site: {err}. Attempt {attempt} of '
              '{retries}'.format(err=err, attempt=attempt, retries=c_retries))
        attempt += 1
if not connectionSuccess:
    print('could not connect to site. Aborting.')
    sys.exit(1)
# Parse the returned XML and read the pool's attributes
dom = xml.dom.minidom.parse(results)
info = dom.getElementsByTagName('pool')
for i in info:
    ats = dict(i.attributes.items())
    postCount = float(ats.get('post_count'))
    name = ats.get('name')
    safe_name = get_valid_filename(name)
    # The sanitized name can be empty, so append a short string derived
    # from the raw name
    name_hash = get_hash_string(name)
    poolName = unicode("_".join([safe_name, name_hash]))
# How many pages the pool spans
totalPages = math.ceil(postCount / postLimit)
print("postCount: " + str(postCount) + ", totalPages: " + str(totalPages))
print('{num} posts found in pool "{pool}"'.format(num=int(postCount),
                                                  pool=poolName))
page = 1.0  # Position in the results
# Get the posts from each page and move to the next one
while True:
    # Fetch this page of the pool, retrying on connection errors
    url = poolURL + '&page=%d' % (page)
    print("\n" + url)
    results = 0
    connectionSuccess = False
    attempt = 1
    while attempt <= c_retries:
        try:
            results = urllib2.urlopen(url)
            connectionSuccess = True
            break
        except Exception as err:
            print('could not connect to site: {err}. Attempt {attempt} of '
                  '{retries}'.format(err=err, attempt=attempt,
                                     retries=c_retries))
            attempt += 1
    if not connectionSuccess:
        print('could not connect to site. Aborting.')
        sys.exit(1)
    dom = xml.dom.minidom.parse(results)
    posts = dom.getElementsByTagName('post')
    # Download the posts on this page, then move to the next
    downloadPosts(posts)
    page += 1.0
    if page > totalPages:
        break
print('\nDownload finished.')