e621 pool downloader, Python 2
# DO WHAT THE HECK YOU WANT TO PUBLIC LICENSE
# Version 4, October 2019
#
# Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
# Copyright (C) 2013 Ben McGinnes <ben@adversary.org>
# Copyright (C) 2019 Hexawolf <hexawolfie@gmail.com>
#
# Everyone is permitted to copy and distribute verbatim or
# modified copies of this license document, and changing it is
# allowed as long as the name is changed.
#
# DO WHAT THE HECK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
#
# 0. You just do anything you want to with this work.
#
# 1. Do not hold the author(s), creator(s), developer(s) or
# distributor(s) liable for anything that happens with your
# use of the work.
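# Usage: python pool-downloader.py POOL_ID
# (the script filename above is illustrative; pass the numeric e621 pool
#  ID as the first argument)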
from __future__ import print_function
import os
import sys
import re
import base64
import socket
import urllib2
import xml.dom.minidom
import string
import hashlib
import math
import traceback
import unicodedata
__version__ = '0.1'
if len(sys.argv) < 2:
    print("You must provide a pool ID")
    sys.exit(1)
poolID = sys.argv[1]
post_index = 1
downloaded = 0
socket.setdefaulttimeout(15)
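# Cookie-aware opener with a browser-like User-Agent, used for all requests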
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
opener.addheaders = [('User-agent', 'Firefox')]
urllib2.install_opener(opener)
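# Maximum number of attempts for each network operation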
c_retries = 5
poolName = ''
postLimit = 24.0 # How many posts are displayed per page
# Search for the pool
poolURL = 'http://e621.net/pool/show.xml?id=%s' % (poolID)

def checkMD5(filename, realMD5):
    """Calculates the MD5 of the file and compares it with realMD5."""
    # Hash the file's contents (reads the whole file into memory)
    with open(filename, 'rb') as f:
        fileMD5 = hashlib.md5(f.read()).hexdigest()
    if fileMD5 != realMD5:
        print("md5 hashes do not match. Download failed or file is corrupt.")
        print(str(fileMD5) + ", " + str(realMD5))
        return False
    return True

def get_valid_filename(s):
    """Normalizes a string into a safe ASCII filename."""
    value = unicodedata.normalize('NFKD', s)
    value = value.encode('ascii', 'ignore')
    value = unicode(value.strip().replace(' ', '_'))
    return unicode(re.sub(r'(?u)[^-\w.]', '', value))

def get_hash_string(s):
    """Derives a short, filename-safe string from s (truncated urlsafe base64)."""
    encoded = base64.urlsafe_b64encode(s.encode('utf-8'))
    return unicode(encoded[:10])  # truncate to keep filenames short

def downloadPosts(posts):
    """Looks for the file URL in each post and downloads it."""
    global poolName
    global c_retries
    global post_index
    destinationFolder = r'.'
    previousDir = os.getcwd()
    # Create the download folder if it does not exist, and move into it
    if not os.path.exists(destinationFolder):
        os.mkdir(destinationFolder)
    os.chdir(destinationFolder)
    poolFolder = os.path.join(os.getcwd(), poolName)
    if not os.path.exists(poolFolder):
        os.mkdir(poolFolder)
    os.chdir(poolFolder)
    # Download each file, retrying failures up to c_retries times
    for post in posts:
        print("")
        try:
            for trial in range(0, c_retries):
                try:
                    getFile(post)
                    # Success: skip the remaining trials, go to the next post
                    break
                except KeyboardInterrupt:
                    raise
                except Exception:
                    print("Download and/or save failed")
                    if trial + 1 < c_retries:
                        print('Re-attempting download. Re-attempt # {t} of '
                              '{retries}.\n'.format(t=(trial + 1),
                                                    retries=(c_retries - 1)))
                        continue
                    raise
        except KeyboardInterrupt:
            raise
        except Exception:
            print("Cannot download file")
            print(traceback.format_exc())
        finally:
            post_index += 1
    os.chdir(previousDir)

def getFile(post, force_download=False):
    """Gets the file from the post."""
    global poolName
    global post_index
    global downloaded
    # Pull the fields we need out of the post's XML node
    url = post.getElementsByTagName('file_url').item(0).firstChild.nodeValue
    md5 = post.getElementsByTagName('md5').item(0).firstChild.nodeValue
    rating = post.getElementsByTagName('rating').item(0).firstChild.nodeValue
    id = post.getElementsByTagName('id').item(0).firstChild.nodeValue
    tags = post.getElementsByTagName('tags').item(0).firstChild.nodeValue
    width = post.getElementsByTagName('width').item(0).firstChild.nodeValue
    height = post.getElementsByTagName('height').item(0).firstChild.nodeValue
    file_size = int(post.getElementsByTagName('file_size')
                    .item(0).firstChild.nodeValue)
    if rating == 's':
        rating = 'safe'
    elif rating == 'q':
        rating = 'questionable'
    else:
        rating = 'explicit'
    # Build the local filename: pool name plus the post's position in the pool
    temp = string.Template(r'${name}_${pos}')
    name = temp.substitute(pos=post_index, id=id, md5=md5, tags=tags,
                           rating=rating, w=width, h=height, name=poolName)
    extension = os.path.splitext(url)[1]
    fullName = get_valid_filename(u''.join([name, extension]))
    if file_size > 1024:
        file_size = '%d KB' % (file_size / 1024)
    else:
        file_size = '%d B' % file_size
    print("Downloading " + url + ' (' + file_size + ')...')
    if not force_download and os.path.exists(fullName):
        msg = "file already exists"
        if not checkMD5(fullName, md5):
            # Local copy is corrupt or incomplete: download it again
            print(msg + ", but md5 hashes do not match. Re-downloading.")
        else:
            print(msg + ". Skipping")
            return
    print("Saving as " + fullName)
    outFile = open(fullName, 'wb')
    try:
        data = urllib2.urlopen(url).read()
        outFile.write(data)
    finally:
        outFile.close()
    if not checkMD5(fullName, md5):
        raise Exception("md5 hashes do not match")
    downloaded += 1
    print('Finished.')

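# Main script: fetch the pool's metadata, then download every page of posts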
connectionSuccess = False
attempt = 1
while attempt <= c_retries:
    try:
        results = urllib2.urlopen(poolURL)
        connectionSuccess = True
        break
    except Exception as err:
        print('could not connect to site: {err}. Attempt {attempt} of '
              '{retries}'.format(err=err, attempt=attempt, retries=c_retries))
        attempt += 1
if not connectionSuccess:
    print('could not connect to site. Aborting.')
    sys.exit(1)
# Parse the returned XML and read the pool's attributes
dom = xml.dom.minidom.parse(results)
info = dom.getElementsByTagName('pool')
for i in info:
    ats = dict(i.attributes.items())
    postCount = float(ats.get('post_count'))
    name = ats.get('name')
    safe_name = get_valid_filename(name)
    # The sanitized name can be empty, so append a short string derived
    # from the raw name
    name_hash = get_hash_string(name)
    poolName = unicode("_".join([safe_name, name_hash]))
# How many pages the pool spans
totalPages = math.ceil(postCount / postLimit)
print("postCount: " + str(postCount) + ", totalPages: " + str(totalPages))
print('{num} posts found in pool "{pool}"'.format(num=int(postCount),
                                                  pool=poolName))
page = 1.0  # Position in the results
# Get the posts from each page and move to the next one
while True:
    # Fetch this page of the pool, retrying on connection errors
    url = poolURL + '&page=%d' % (page)
    print("\n" + url)
    results = 0
    connectionSuccess = False
    attempt = 1
    while attempt <= c_retries:
        try:
            results = urllib2.urlopen(url)
            connectionSuccess = True
            break
        except Exception as err:
            print('could not connect to site: {err}. Attempt {attempt} of '
                  '{retries}'.format(err=err, attempt=attempt,
                                     retries=c_retries))
            attempt += 1
    if not connectionSuccess:
        print('could not connect to site. Aborting.')
        sys.exit(1)
    dom = xml.dom.minidom.parse(results)
    posts = dom.getElementsByTagName('post')
    # Download the posts on this page, then move to the next
    downloadPosts(posts)
    page += 1.0
    if page > totalPages:
        break
print('\nDownload finished.')