auscompgeek/xkcd_download.py

## xkcd_download.py
#!/usr/bin/env python
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""An xkcd downloader by auscompgeek.

This Python script will download all xkcd comics within the range specified.

Both Python 2.x and 3k are supported. Tested on Python 2.7.3 and 3.2.1.

Comics will be saved in the format "<id>-<name>.<img-ext>".
The title and mouseover text will also be saved in "<id>-<name>.txt".
"""

# 2to3 switches to test:
## -f all -f idioms -f buffer -f set_literal -f ws_comma -x urllib -p
# urllib: excluded (-x) because the import block below already selects the
# right urlopen/HTTPError for Python 3 itself.

# ====BEGIN CONFIGURATION SECTION====

# If you use Python < 2.6, you should comment this future statement.
from __future__ import print_function, unicode_literals

# Set to True to fetch the comic info and the images over HTTPS.
USE_SSL = False

# Regular expressions searched for in the alt text; every match is replaced
# by a line break before the text is saved.
ALT_NEWLINE_SPLITS = [
    '  ',
    #' / ',
    #r'(?<') (?>')',
    #r'(?<.) ',
]

# =====END CONFIGURATION SECTION=====

# URL template for a comic's JSON metadata; the remaining %d takes the
# comic ID.
XKCD_INFO_URL = 'http%s://xkcd.com/%%d/info.0.json' % ('' if not USE_SSL else 's')

# URL template for a comic image; the remaining %s takes the image filename.
XKCD_IMG_URL = 'http%s://sslimgs.xkcd.com/comics/%%s' % ('' if not USE_SSL else 's')

import codecs, re, os, sys

# Try to import urlopen and HTTPError in a compatible way.
if sys.version_info[0] < 3:
    try:
        from urllib2 import urlopen, HTTPError
    except Exception:  # ancient Python?
        from urllib import urlopen
        HTTPError = None
else:
    from urllib.request import urlopen, HTTPError

# Import json, falling back to simplejson if it isn't available.
try:
    import json
except Exception:
    import simplejson as json


def main():
    """Download every comic in the requested ID range.

    The first and last comic IDs are taken from sys.argv[1] and sys.argv[2].
    Whichever is missing is prompted for; an empty answer means 1 for the
    first ID and the first ID for the last one.

    Comic 404 is skipped.  The loop stops at the first comic whose request
    fails with HTTP 404; any other error is reported on stderr and the loop
    moves on to the next comic.
    """
    # raw_input() only exists on Python 2; on Python 3 it is called input().
    if sys.version_info[0] < 3:
        prompt = raw_input
    else:
        prompt = input

    args = sys.argv[1:]

    if len(args) > 0:
        first = int(args[0])
    else:
        first = int(prompt('Enter first comic ID to download (1): ') or 1)

    if len(args) > 1:
        last = int(args[1])
    else:
        last = int(prompt('Enter last comic ID to download (%d): ' % first)
                   or first)

    for comic_id in range(first, last + 1):
        if comic_id == 404:
            print("Skipping 404 (it's a 404).")
            continue
        try:
            print('Getting xkcd %d...' % comic_id)
            get_xkcd(comic_id)
        except HTTPError:
            # sys.exc_info() instead of "except ... as" keeps the syntax
            # valid on both Python 2 and Python 3.
            err = sys.exc_info()[1]
            if err.code != 404:
                sys.stderr.write('xkcd %d: HTTPError: %d %s\n' %
                                 (comic_id, err.code, err.reason))
                continue
            sys.stderr.write('xkcd %d is 404. Stop.\n' % comic_id)
            return
        except Exception:
            failure = sys.exc_info()[1]
            sys.stderr.write('Error while getting xkcd %d: %r\n' %
                             (comic_id, failure))


def get_xkcd(num):
    """Fetch the specified xkcd comic.

    Downloads the JSON metadata for comic ``num`` from XKCD_INFO_URL and
    passes its image URL, title, alt text and date fields to save_xkcd().
    If the metadata's ``link`` field points below
    http://imgs.xkcd.com/comics/, that image is downloaded as well.

    Exceptions raised by urlopen() (such as HTTPError for a missing comic)
    propagate to the caller unchanged.
    """

    # Open the connection *before* the try block.  If urlopen() raises there
    # is nothing to close, and a `finally` touching the still-unbound name
    # would replace the real error with an UnboundLocalError.
    info_file = urlopen(XKCD_INFO_URL % num)
    try:
        # Decode explicitly: a bare decode() defaults to ASCII on Python 2.
        info = info_file.read().decode('utf-8')
    finally:
        info_file.close()

    info = json.loads(info)

    save_xkcd(num, info['img'], info['title'], info['alt'],
              info['year'], info['month'], info['day'])

    # Some comics are linked to a larger version of the comic. Save these.
    link = info['link']
    if link.startswith('http://imgs.xkcd.com/comics/'):
        save_xkcd_image(num, link.split('/')[-1])


def save_xkcd(num, img, title, alt, year, month, day):
    """Save the specified xkcd comic.  Called by get_xkcd().

    Downloads the image named by the last path component of ``img`` via
    save_xkcd_image(), then writes "<num>-<name>.txt" (UTF-8) containing the
    title, the date as "(year-month-day)", a blank line and the alt text.
    ``name`` is the image filename up to its first dot.  Every match of a
    pattern in ALT_NEWLINE_SPLITS inside the alt text is replaced by
    os.linesep before writing.
    """

    filename = img.split('/')[-1]  # http://imgs.xkcd.com/comics/...
    name = filename.split('.', 1)[0]

    print('Saving "%s"...' % title)
    save_xkcd_image(num, filename)

    for pattern in ALT_NEWLINE_SPLITS:
        alt = re.sub(pattern, os.linesep, alt)

    txt_file = codecs.open('%d-%s.txt' % (num, name), 'w', 'utf-8')
    try:
        txt_file.write('%s (%s-%s-%s)%s%s' %
                       (title, year, month, day, os.linesep * 2, alt))
    finally:
        # Close the handle even when write() fails.  try/finally rather
        # than `with` keeps the script usable on very old Python 2.
        txt_file.close()


def save_xkcd_image(num, filename):
    """Save the specified xkcd comic image.

    Downloads ``filename`` from XKCD_IMG_URL and writes the raw bytes to
    "<num>-<filename>" in the current directory.
    """

    print('Downloading "%s"...' % filename)

    # Read the whole image first so that a failed download does not leave an
    # empty file behind, and always release the HTTP connection.
    img = urlopen(XKCD_IMG_URL % filename)
    try:
        data = img.read()
    finally:
        img.close()

    img_file = open('%d-%s' % (num, filename), 'wb')
    try:
        img_file.write(data)
    finally:
        img_file.close()


# Only start downloading when run as a script, not when imported as a module.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	# This Source Code Form is subject to the terms of the Mozilla Public
	# License, v. 2.0. If a copy of the MPL was not distributed with this
	# file, You can obtain one at http://mozilla.org/MPL/2.0/.

	"""An xkcd downloader by auscompgeek.

	This Python script will download all xkcd comics within the range specified.

	Both Python 2.x and 3k are supported. Tested on Python 2.7.3 and 3.2.1.

	Comics will be saved in the format "<id>-<name>.<img-ext>".
	The title and mouseover text will also be saved in "<id>-<name>.txt".
	"""

	# 2to3 switches to test:
	## -f all -f idioms -f buffer -f set_literal -f ws_comma -x urllib -p
	# urllib: line 49, tests for Py3k

	# ====BEGIN CONFIGURATION SECTION====

	# If you use Python < 2.6, you should comment this future statement.
	from __future__ import print_function, unicode_literals

	# Should we use SSL to download the comic info and images?
	USE_SSL = False

	# Where should we split the alt text into newlines? (regexp)
	# The active pattern is TWO spaces; a single space would break the alt
	# text after every word.
	ALT_NEWLINE_SPLITS = [
	    '  ',
	    #' / ',
	    #r'(?<') (?>')',
	    #r'(?<.) ',
	]

	# =====END CONFIGURATION SECTION=====

	# The XKCD comic info URL, where %d is the comic ID.
	XKCD_INFO_URL = 'http%s://xkcd.com/%%d/info.0.json' % ('s' if USE_SSL else '')

	# The XKCD comic image URL, where %s is the comic filename.
	XKCD_IMG_URL = 'http%s://sslimgs.xkcd.com/comics/%%s' % ('s' if USE_SSL else '')

	import codecs, re, os, sys

	# Try to import urlopen and HTTPError in a compatible way.
	if sys.version_info[0] < 3:
	    try:
	        from urllib2 import urlopen, HTTPError
	    except Exception:  # ancient Python?
	        from urllib import urlopen
	        HTTPError = None
	else:
	    from urllib.request import urlopen, HTTPError

	# Import json, falling back to simplejson if it isn't available.
	try:
	    import json
	except Exception:
	    import simplejson as json


	def main():
	    """Download every comic in the requested ID range.

	    The first and last comic IDs come from sys.argv[1] and sys.argv[2];
	    a missing one is prompted for (an empty answer means 1 for the first
	    ID and the first ID for the last one).  Comic 404 is skipped, the
	    loop stops at the first HTTP 404, and any other error is reported on
	    stderr before continuing with the next comic.
	    """
	    # universal read input (Python 2 and 3 compatibility)
	    read = raw_input if sys.version_info[0] < 3 else input

	    start_comic = int(sys.argv[1] if len(sys.argv) > 1 else
	                      read('Enter first comic ID to download (1): ') or 1)
	    end_comic = int(sys.argv[2] if len(sys.argv) > 2 else
	                    read('Enter last comic ID to download (%d): ' %
	                         start_comic) or start_comic)

	    for i in range(start_comic, end_comic + 1):
	        if i == 404:
	            print("Skipping 404 (it's a 404).")
	            continue
	        try:
	            print('Getting xkcd %d...' % i)
	            get_xkcd(i)
	        except HTTPError:
	            # sys.exc_info() keeps the syntax valid on Python 2 and 3.
	            ex = sys.exc_info()[1]
	            if ex.code == 404:
	                sys.stderr.write('xkcd %d is 404. Stop.\n' % i)
	                return
	            sys.stderr.write('xkcd %d: HTTPError: %d %s\n' % (i, ex.code,
	                                                              ex.reason))
	        except Exception:
	            sys.stderr.write('Error while getting xkcd %d: %r\n' %
	                             (i, sys.exc_info()[1]))


	def get_xkcd(num):
	    """Fetch the specified xkcd comic.

	    Downloads the JSON metadata for comic ``num`` from XKCD_INFO_URL and
	    passes its image URL, title, alt text and date fields to save_xkcd().
	    If the metadata's ``link`` field points below
	    http://imgs.xkcd.com/comics/, that image is downloaded as well.

	    Exceptions raised by urlopen() propagate to the caller unchanged.
	    """

	    # Open the connection before the try block: if urlopen() raises there
	    # is nothing to close, and a `finally` touching the unbound name would
	    # replace the real error with an UnboundLocalError.
	    info_file = urlopen(XKCD_INFO_URL % num)
	    try:
	        # Decode explicitly: a bare decode() defaults to ASCII on Python 2.
	        info = info_file.read().decode('utf-8')
	    finally:
	        info_file.close()

	    info = json.loads(info)

	    save_xkcd(num, info['img'], info['title'], info['alt'],
	              info['year'], info['month'], info['day'])

	    # Some comics are linked to a larger version of the comic. Save these.
	    link = info['link']
	    if link.startswith('http://imgs.xkcd.com/comics/'):
	        save_xkcd_image(num, link.split('/')[-1])


	def save_xkcd(num, img, title, alt, year, month, day):
	    """Save the specified xkcd comic. Called by get_xkcd().

	    Downloads the image named by the last path component of ``img`` via
	    save_xkcd_image(), then writes "<num>-<name>.txt" (UTF-8) containing
	    the title, the date as "(year-month-day)", a blank line and the alt
	    text.  Every match of a pattern in ALT_NEWLINE_SPLITS inside the alt
	    text is replaced by os.linesep before writing.
	    """

	    filename = img.split('/')[-1]  # http://imgs.xkcd.com/comics/...
	    name = filename.split('.', 1)[0]

	    print('Saving "%s"...' % title)
	    save_xkcd_image(num, filename)

	    for pattern in ALT_NEWLINE_SPLITS:
	        alt = re.sub(pattern, os.linesep, alt)

	    txt_file = codecs.open('%d-%s.txt' % (num, name), 'w', 'utf-8')
	    try:
	        txt_file.write('%s (%s-%s-%s)%s%s' %
	                       (title, year, month, day, os.linesep * 2, alt))
	    finally:
	        # Close the handle even when write() fails.
	        txt_file.close()


	def save_xkcd_image(num, filename):
	    """Save the specified xkcd comic image.

	    Downloads ``filename`` from XKCD_IMG_URL and writes the raw bytes to
	    "<num>-<filename>" in the current directory.
	    """

	    print('Downloading "%s"...' % filename)

	    # Read the whole image first so a failed download does not leave an
	    # empty file behind, and always release the HTTP connection.
	    img = urlopen(XKCD_IMG_URL % filename)
	    try:
	        data = img.read()
	    finally:
	        img.close()

	    img_file = open('%d-%s' % (num, filename), 'wb')
	    try:
	        img_file.write(data)
	    finally:
	        img_file.close()


	# Only start downloading when run as a script, not when imported.
	if __name__ == '__main__':
	    main()