# e621 pool downloader, Python 2
# DO WHAT THE HECK YOU WANT TO PUBLIC LICENSE
#             Version 4, October 2019
#
# Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
# Copyright (C) 2013 Ben McGinnes <ben@adversary.org>
# Copyright (C) 2019 Hexawolf <hexawolfie@gmail.com>
#
# Everyone is permitted to copy and distribute verbatim or
# modified copies of this license document, and changing it is
# allowed as long as the name is changed.
#
# DO WHAT THE HECK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
#
#  0. You just do anything you want to with this work.
#
#  1. Do not hold the author(s), creator(s), developer(s) or
#     distributor(s) liable for anything that happens with your
#     use of the work.
from __future__ import print_function
import os
import sys
import re
import base64
import socket
import urllib2
import xml.dom.minidom
import string
import hashlib
import math
import traceback
import unicodedata

__version__ = '0.1'
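# Usage: python <this script> POOL_ID
# POOL_ID is the numeric id of the e621 pool whose posts should be downloaded
# into a folder named after the pool in the current directory.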
if len(sys.argv) < 2:
    print("You must provide a pool ID")
    sys.exit(1)
poolID = sys.argv[1]

post_index = 1
downloaded = 0
socket.setdefaulttimeout(15)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
opener.addheaders = [('User-agent', 'Firefox')]
urllib2.install_opener(opener)
c_retries = 5
poolName = ''
postLimit = 24.0  # How many posts are displayed per page

# Search for the pool
poolURL = 'http://e621.net/pool/show.xml?id=%s' % (poolID)
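# The XML from pool/show.xml is expected to carry the pool's attributes
# (name, post_count) plus one page of <post> elements; further pages are
# requested below via the &page= query parameter.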


def checkMD5(filename, realMD5):
    """Calculates the md5 of the file and compares it with realMD5"""
    # Read the whole file and hash it
    with open(filename, 'rb') as f:
        fileMD5 = hashlib.md5(f.read()).hexdigest()
    if fileMD5 != realMD5:
        print("md5 hashes do not match. Download failed or file is corrupt.")
        print(str(fileMD5) + ", " + str(realMD5))
        return False
    return True

def get_valid_filename(s):
    """Turns an arbitrary unicode string into a safe ASCII filename."""
    value = s
    value = unicodedata.normalize('NFKD', value)
    value = value.encode('ascii', 'ignore')
    value = unicode(value.strip())
    value = unicode(value.replace(' ', '_'))
    value = unicode(re.sub(r'(?u)[^-\w.]', '', value))
    return value


def get_hash_string(s):
    """Returns a short, filename-safe token derived from the string."""
    s = s.encode('utf-8')
    hash = base64.urlsafe_b64encode(s)
    hash = hash[:10]  # truncate
    hash = unicode(hash)
    return hash

def downloadPosts(posts):
    """
    Looks for the url of the file in each post and downloads it
    """
    destinationFolder = r'.'
    global poolName
    global c_retries
    previousDir = os.getcwd()
    # Create the folder in which files will be downloaded if it does not exist
    # and move to it
    if not os.path.exists(destinationFolder):
        os.mkdir(destinationFolder)
    os.chdir(destinationFolder)
    poolFolder = os.path.join(os.getcwd(), poolName)
    if not os.path.exists(poolFolder):
        os.mkdir(poolFolder)
    os.chdir(poolFolder)
    # Download each file
    for post in posts:
        print("")
        try:
            for trial in range(0, c_retries):
                try:
                    getFile(post)
                    # it succeeded, so it's ok to skip the trials and go to the
                    # next post
                    break
                except KeyboardInterrupt:
                    raise
                except Exception as err:
                    print("Download and/or save failed")
                    if trial + 1 < c_retries:
                        print('Re-attempting download. Re-attempt # {t} of '
                              '{retries}.\n'.format(t=(trial + 1),
                                                    retries=(c_retries - 1)))
                        continue
                    raise
        except KeyboardInterrupt:
            raise
        except Exception as err:
            print("Cannot download file")
            print(traceback.format_exc())
        finally:
            global post_index
            post_index += 1
    os.chdir(previousDir)

def getFile(post, force_download=False):
    """Gets the file from the post"""
    # Parse API XML. Get the url to download the file from.
    urlNodes = post.getElementsByTagName('file_url')
    urlNode = urlNodes.item(0)
    url = urlNode.firstChild.nodeValue
    md5 = post.getElementsByTagName('md5').item(0).firstChild.nodeValue
    rating = post.getElementsByTagName('rating').item(0).firstChild.nodeValue
    id = post.getElementsByTagName('id').item(0).firstChild.nodeValue
    tags = post.getElementsByTagName('tags').item(0).firstChild.nodeValue
    width = post.getElementsByTagName('width').item(0).firstChild.nodeValue
    height = post.getElementsByTagName('height').item(0).firstChild.nodeValue
    file_size = int(post.getElementsByTagName('file_size').item(0).firstChild.nodeValue)
    if rating == 's':
        rating = 'safe'
    elif rating == 'q':
        rating = 'questionable'
    else:
        rating = 'explicit'
    global poolName
    global post_index
    # Build the local file name from the template; all parsed fields are
    # available as template placeholders.
    temp = string.Template(r'${name}_${pos}')
    name = temp.substitute(pos=post_index, id=id, md5=md5,
                           tags=tags, rating=rating,
                           w=width, h=height, name=poolName)
    extension = os.path.splitext(url)[1]
    fullName = get_valid_filename(u''.join([name, extension]))
    if file_size > 1024:
        file_size = '%d KB' % (file_size / 1024)
    else:
        file_size = '%d B' % file_size
    print("Downloading " + url + ' (' + file_size + ')...')
    if not force_download:
        if os.path.exists(fullName):
            msg = "file already exists"
            if not checkMD5(fullName, md5):
                msg += ", but md5 hashes do not match. Re-downloading."
                print(msg)
            else:
                msg += ". Skipping."
                print(msg)
                return
print("Saving as " + fullName) | |
outFile = open(fullName, 'wb') | |
try: | |
file = urllib2.urlopen(url) | |
data = file.read() | |
outFile.write(data) | |
finally: | |
outFile.close() | |
if not checkMD5(fullName, md5): | |
raise Exception("md5 hashes do not match") | |
global downloaded | |
downloaded += 1 | |
print('Finished.') | |
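

# --- Main script: look up the pool, then fetch and download every page ---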
connectionSuccess = False
attempt = 1
while attempt <= c_retries:
    try:
        results = urllib2.urlopen(poolURL)
        connectionSuccess = True
        break
    except Exception as err:
        print('could not connect to site: {err}. Attempt {attempt} of '
              '{retries}'.format(err=err, attempt=attempt, retries=c_retries))
        attempt += 1
if not connectionSuccess:
    sys.exit(1)
# Parse the returned XML
dom = xml.dom.minidom.parse(results)
info = dom.getElementsByTagName('pool')
for i in info:
    ats = dict(i.attributes.items())
    postCount = float(ats.get('post_count'))
    name = ats.get('name')
    safe_name = get_valid_filename(name)
    # sometimes the safe name is empty, so append a short hash of the name
    name_hash = get_hash_string(name)
    poolName = unicode("_".join([safe_name, name_hash]))

# How many pages does the pool have
totalPages = math.ceil(postCount / postLimit)
print("postCount: " + str(postCount) + " , totalPages: " + str(totalPages))
print('{num} posts found in pool "{pool}"'.format(num=postCount,
                                                  pool=poolName))
page = 1.0  # Position in the results
# Get the posts from each page and move to the next one
while True:
    # Get the posts in this page and parse them
    url = poolURL + '&page=%d' % (page)
    print("\n" + url)
    results = 0
    connectionSuccess = False
    attempt = 1
    while attempt <= c_retries:
        try:
            results = urllib2.urlopen(url)
            connectionSuccess = True
            break
        except Exception as err:
            print('could not connect to site: {err}. Attempt {attempt} of '
                  '{retries}'.format(err=err, attempt=attempt,
                                     retries=c_retries))
            attempt += 1
    if not connectionSuccess:
        print('could not connect to site. Aborting.')
        sys.exit(1)
    dom = xml.dom.minidom.parse(results)
    posts = dom.getElementsByTagName('post')
    # Download the posts in this page and move to the next
    downloadPosts(posts)
    page += 1.0
    if page > totalPages:
        break

print('\nDownload finished.')