Schnouki/manolodownload.py

## manolodownload.py
#!/usr/bin/env python3
#
# This is manolodownload, a simple script that allows one to download comic
# books from the Manolosanctis website (http://www.manolosanctis.com/) and save
# them locally as a .cbz archive, readable by an application like Comix.
#
# This can only download comic books that are already readable on the
# Manolosanctis website (many of them being redistributable under the terms of a
# Creative Commons license); it is in no way intended to harm anybody's
# intellectual property or copyright. It is simply intended to make these
# amazing books accessible to people who do not want to use a Flash reader, or
# who would want to read them on a platform that is not supported by Flash (e.g.
# PowerPC, ARM, many mobile phones...).
#
# Copyright (c) 2010 Thomas Jost
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import httplib2
import os.path
import re
import sys
import xml.etree.ElementTree
import zipfile

# Matches an album page URL and captures the numeric album id. The dots of the
# host name are escaped so that they only match literal dots.
VALID_ALBUM_URL = re.compile(r"^http://www\.manolosanctis\.com/bd/(\d+)/")
# Template of the XML metadata URL of an album; {0} is the album id.
ALBUM_METADATA_URL = "http://www.manolosanctis.com/xml/album/{0}"

# HTTP client shared by every request this script makes.
_http = httplib2.Http()

def handle_url(url):
    """Download the album designated by a Manolosanctis album URL.

    The URL is matched against VALID_ALBUM_URL. When it matches, the captured
    album id is handed to download_album(); otherwise an error message naming
    the offending URL is printed on stderr and nothing is downloaded.

    Returns None in both cases.
    """
    m = VALID_ALBUM_URL.match(url)
    if m is None:
        # The placeholder has to be filled in, otherwise a literal "{0}" is
        # printed instead of the rejected URL.
        print("Invalid URL: {0}".format(url), file=sys.stderr)
        return

    download_album(m.group(1))

def download_album(bd):
    """Download every page of album *bd* into a .cbz archive.

    The metadata is obtained from get_album_metadata(); when that returns
    None nothing is done. The archive is created in the current directory and
    named "<authors> - <title>.cbz". Pages are stored as 0001.jpg, 0002.jpg,
    ... following their position in the metadata; a page that cannot be
    downloaded is reported on stderr and skipped.

    Returns None.
    """
    md = get_album_metadata(bd)
    if md is None:
        return

    pages = md["pages"]
    total = len(pages)
    print("Downloading \"{0}\" by {1} ({2} pages)...".format(md["title"], md["authors"], total))

    fn = "{1} - {0}.cbz".format(md["title"], md["authors"])
    # Title and authors come from the network: neutralise path separators so
    # the archive is always created as a single file in the current directory.
    for sep in (os.sep, os.altsep):
        if sep:
            fn = fn.replace(sep, "_")

    dl = 0
    # The context manager guarantees that the archive is finalised and closed,
    # even when a download raises.
    with zipfile.ZipFile(fn, mode="w") as cbz:
        for n, url in enumerate(pages, start=1):
            img = get_album_page(url)
            if img is None:
                print("Could not download page {0}".format(n), file=sys.stderr)
                continue
            cbz.writestr("{0:04d}.jpg".format(n), img)
            dl += 1
            # "\r" keeps the progress counter on a single terminal line.
            print("Downloaded: [{0} / {1}]".format(n, total), end="\r")
    print("Downloaded {0} pages successfuly!".format(dl))

def get_album_metadata(bd):
    """Fetch and parse the XML metadata of album *bd*.

    Returns a dict with the keys:
      - "title":   text of the album/title element (None when absent),
      - "authors": text of the album/authors element (None when absent),
      - "pages":   list of page image URLs, one per album/page element that
                   has a "src" attribute, with "_l" inserted before the file
                   extension.

    Returns None, after printing a message on stderr, when the HTTP status is
    not 200 or when the response is not well-formed XML.
    """
    print("Retrieving album metadata...")

    # Read the metadata
    response, content = _http.request(ALBUM_METADATA_URL.format(bd))
    if response.status != 200:
        print("Could not retrieve metadata: got HTTP code {0}.".format(response.status), file=sys.stderr)
        return None

    # Now parse the XML and extract useful stuff
    try:
        tree = xml.etree.ElementTree.fromstring(content)
    except xml.etree.ElementTree.ParseError as err:
        print("Could not parse metadata: {0}".format(err), file=sys.stderr)
        return None

    # A page without a "src" attribute cannot be downloaded: skip it instead
    # of crashing in os.path.splitext(None).
    sources = [elt.get("src") for elt in tree.findall("album/page")]

    return {
        "title": tree.findtext("album/title"),
        "authors": tree.findtext("album/authors"),
        # Add the size suffix to each URL, right before the extension
        # (presumably "_l" selects the large rendition -- TODO confirm).
        "pages": ["{0}_l{1}".format(*os.path.splitext(src)) for src in sources if src],
    }

def get_album_page(url):
    """Fetch *url* and return the response body, or None unless the HTTP
    status is 200."""
    resp, body = _http.request(url)
    return body if resp.status == 200 else None


# Script entry point: every command-line argument is treated as an album URL.
if __name__ == '__main__':
    album_urls = sys.argv[1:]
    if not album_urls:
        # No URL given: show the usage on stderr and exit with an error status
        usage = "Syntax: {0} url1 [url2 ...]".format(sys.argv[0])
        print(usage, file=sys.stderr)
        sys.exit(1)

    for album_url in album_urls:
        handle_url(album_url)
	#!/usr/bin/env python3
	#
	# This is manolodownload, a simple script that allows one to download comic
	# books from the Manolosanctis website (http://www.manolosanctis.com/) and save
	# them locally as a .cbz archive, readable by an application like Comix.
	#
	# This can only download comic books that are already readable on the
	# Manolosanctis website (many of them being redistributable under the terms of a
	# Creative Commons license); it is in no way intended to harm anybody's
	# intellectual property or copyright. It is simply intended to make these
	# amazing books accessible to people who do not want to use a Flash reader, or
	# who would want to read them on a platform that is not supported by Flash (e.g.
	# PowerPC, ARM, many mobile phones...).
	#
	# Copyright (c) 2010 Thomas Jost
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	import httplib2
	import os.path
	import re
	import sys
	import xml.etree.ElementTree
	import zipfile

	# Matches an album page URL and captures the numeric album id. The dots of the
	# host name are escaped so that they only match literal dots.
	VALID_ALBUM_URL = re.compile(r"^http://www\.manolosanctis\.com/bd/(\d+)/")
	# Template of the XML metadata URL of an album; {0} is the album id.
	ALBUM_METADATA_URL = "http://www.manolosanctis.com/xml/album/{0}"

	# HTTP client shared by every request this script makes.
	_http = httplib2.Http()

	def handle_url(url):
	    """Download the album designated by a Manolosanctis album URL.

	    The URL is matched against VALID_ALBUM_URL. When it matches, the captured
	    album id is handed to download_album(); otherwise an error message naming
	    the offending URL is printed on stderr and nothing is downloaded.

	    Returns None in both cases.
	    """
	    m = VALID_ALBUM_URL.match(url)
	    if m is None:
	        # The placeholder has to be filled in, otherwise a literal "{0}" is
	        # printed instead of the rejected URL.
	        print("Invalid URL: {0}".format(url), file=sys.stderr)
	        return

	    download_album(m.group(1))

	def download_album(bd):
	    """Download every page of album *bd* into a .cbz archive.

	    The metadata is obtained from get_album_metadata(); when that returns
	    None nothing is done. The archive is created in the current directory and
	    named "<authors> - <title>.cbz". Pages are stored as 0001.jpg, 0002.jpg,
	    ... following their position in the metadata; a page that cannot be
	    downloaded is reported on stderr and skipped.

	    Returns None.
	    """
	    md = get_album_metadata(bd)
	    if md is None:
	        return

	    pages = md["pages"]
	    total = len(pages)
	    print("Downloading \"{0}\" by {1} ({2} pages)...".format(md["title"], md["authors"], total))

	    fn = "{1} - {0}.cbz".format(md["title"], md["authors"])
	    # Title and authors come from the network: neutralise path separators so
	    # the archive is always created as a single file in the current directory.
	    for sep in (os.sep, os.altsep):
	        if sep:
	            fn = fn.replace(sep, "_")

	    dl = 0
	    # The context manager guarantees that the archive is finalised and closed,
	    # even when a download raises.
	    with zipfile.ZipFile(fn, mode="w") as cbz:
	        for n, url in enumerate(pages, start=1):
	            img = get_album_page(url)
	            if img is None:
	                print("Could not download page {0}".format(n), file=sys.stderr)
	                continue
	            cbz.writestr("{0:04d}.jpg".format(n), img)
	            dl += 1
	            # "\r" keeps the progress counter on a single terminal line.
	            print("Downloaded: [{0} / {1}]".format(n, total), end="\r")
	    print("Downloaded {0} pages successfuly!".format(dl))

	def get_album_metadata(bd):
	    """Fetch and parse the XML metadata of album *bd*.

	    Returns a dict with the keys:
	      - "title":   text of the album/title element (None when absent),
	      - "authors": text of the album/authors element (None when absent),
	      - "pages":   list of page image URLs, one per album/page element that
	                   has a "src" attribute, with "_l" inserted before the file
	                   extension.

	    Returns None, after printing a message on stderr, when the HTTP status is
	    not 200 or when the response is not well-formed XML.
	    """
	    print("Retrieving album metadata...")

	    # Read the metadata
	    response, content = _http.request(ALBUM_METADATA_URL.format(bd))
	    if response.status != 200:
	        print("Could not retrieve metadata: got HTTP code {0}.".format(response.status), file=sys.stderr)
	        return None

	    # Now parse the XML and extract useful stuff
	    try:
	        tree = xml.etree.ElementTree.fromstring(content)
	    except xml.etree.ElementTree.ParseError as err:
	        print("Could not parse metadata: {0}".format(err), file=sys.stderr)
	        return None

	    # A page without a "src" attribute cannot be downloaded: skip it instead
	    # of crashing in os.path.splitext(None).
	    sources = [elt.get("src") for elt in tree.findall("album/page")]

	    return {
	        "title": tree.findtext("album/title"),
	        "authors": tree.findtext("album/authors"),
	        # Add the size suffix to each URL, right before the extension
	        # (presumably "_l" selects the large rendition -- TODO confirm).
	        "pages": ["{0}_l{1}".format(*os.path.splitext(src)) for src in sources if src],
	    }

	def get_album_page(url):
	    """Fetch *url* and return the response body, or None unless the HTTP
	    status is 200."""
	    response, content = _http.request(url)
	    if response.status != 200:
	        return None
	    return content


	# Script entry point: every command-line argument is treated as an album URL.
	if __name__ == '__main__':
	    if len(sys.argv) < 2:
	        # No URL given: show the usage on stderr and exit with an error status
	        print("Syntax: {0} url1 [url2 ...]".format(sys.argv[0]), file=sys.stderr)
	        sys.exit(1)

	    for url in sys.argv[1:]:
	        handle_url(url)