Skip to content

Instantly share code, notes, and snippets.

@auscompgeek
Last active April 30, 2021 23:07
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save auscompgeek/5218149 to your computer and use it in GitHub Desktop.
Save auscompgeek/5218149 to your computer and use it in GitHub Desktop.
A forwards and backwards compatible xkcd downloader written in Python.
#!/usr/bin/env python
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""An xkcd downloader by auscompgeek.
This Python script will download all xkcd comics within the range specified.
Both Python 2.x and 3k are supported. Tested on Python 2.7.3 and 3.2.1.
Comics will be saved in the format "<id>-<name>.<img-ext>".
The title and mouseover text will also be saved in "<id>-<name>.txt".
"""
# 2to3 switches to test:
## -f all -f idioms -f buffer -f set_literal -f ws_comma -x urllib -p
# urllib: line 49, tests for Py3k
# ====BEGIN CONFIGURATION SECTION====
# If you use Python < 2.6, you should comment out this future statement.
from __future__ import print_function, unicode_literals
# Should we use SSL (https) to download the comic info and images?
# This only selects the URL scheme of the two URL templates below.
USE_SSL = False
# Where should the alt text be split into separate lines?
# Each entry is a regular expression; save_xkcd() applies them in order and
# replaces every match with os.linesep.
# NOTE(review): the active pattern is a single space, which turns every space
# into a line break -- confirm this is intended and not whitespace that was
# collapsed when the file was copied.
ALT_NEWLINE_SPLITS = [
' ',
#' / ',
#r'(?<') (?>')',
#r'(?<.) ',
]
# =====END CONFIGURATION SECTION=====
# The XKCD comic info URL template, where %d is the comic ID.
# (%%d survives the first formatting pass as a literal %d placeholder.)
XKCD_INFO_URL = 'http%s://xkcd.com/%%d/info.0.json' % ('s' if USE_SSL else '')
# The XKCD comic image URL template, where %s is the comic filename.
# NOTE(review): the host is sslimgs.xkcd.com even when USE_SSL is False --
# confirm that this host is the intended one for plain-http downloads.
XKCD_IMG_URL = 'http%s://sslimgs.xkcd.com/comics/%%s' % ('s' if USE_SSL else '')
import codecs, re, os, sys
# Try to import urlopen and HTTPError in a compatible way.
if sys.version_info[0] < 3:
try:
from urllib2 import urlopen, HTTPError
except Exception: # ancient Python?
from urllib import urlopen
HTTPError = None
else:
from urllib.request import urlopen, HTTPError
# Import json, falling back to simplejson if it isn't available.
try:
import json
except Exception:
import simplejson as json
def main():
    """Download every xkcd comic in an inclusive range of comic IDs.

    The first and last IDs are read from ``sys.argv[1]`` and ``sys.argv[2]``
    when present; otherwise the user is prompted.  An empty answer to the
    first prompt means 1, and an empty answer to the second prompt means
    "same as the first ID".

    Comic 404 is skipped without issuing a request.  An HTTP 404 response
    for any other ID ends the run; every other error is reported on stderr
    and the loop moves on to the next comic.
    """
    # universal read input (Python 2 and 3 compatibility)
    read = raw_input if sys.version_info[0] < 3 else input
    start_comic = int(sys.argv[1] if len(sys.argv) > 1 else
                      read('Enter first comic ID to download (1): ') or 1)
    end_comic = int(sys.argv[2] if len(sys.argv) > 2 else
                    read('Enter last comic ID to download (%d): ' %
                         start_comic) or start_comic)
    for i in range(start_comic, end_comic + 1):
        if i == 404:
            print("Skipping 404 (it's a 404).")
            continue
        try:
            print('Getting xkcd %d...' % i)
            get_xkcd(i)
        except HTTPError:
            # sys.exc_info() instead of "except ... as ex" keeps the syntax
            # valid on very old Python versions.
            ex = sys.exc_info()[1]
            if ex.code == 404:
                sys.stderr.write('xkcd %d is 404. Stop.\n' % i)
                return
            # Older Python releases ship an HTTPError without a ``reason``
            # attribute; fall back to ``msg`` so that reporting the error
            # cannot itself raise AttributeError and abort the whole run.
            reason = getattr(ex, 'reason', None) or getattr(ex, 'msg', '')
            sys.stderr.write('xkcd %d: HTTPError: %d %s\n' % (i, ex.code,
                                                              reason))
        except Exception:
            # Best effort: report the failure and carry on with the next ID.
            sys.stderr.write('Error while getting xkcd %d: %r\n' %
                             (i, sys.exc_info()[1]))
def get_xkcd(num):
    """Fetch the specified xkcd comic.

    Downloads the JSON info for comic ``num`` from XKCD_INFO_URL, then hands
    the image URL, title, alt text and date fields to save_xkcd().  When the
    info's ``link`` field points into http://imgs.xkcd.com/comics/, that
    image is saved as well via save_xkcd_image().

    Exceptions raised by urlopen() (for example HTTPError) propagate to the
    caller unchanged.
    """
    # Open the URL *before* entering the try block.  If urlopen() raises,
    # there is nothing to close yet; with the call inside the try block the
    # finally clause would hit an unbound ``info_file`` and replace the real
    # HTTPError with an UnboundLocalError, hiding 404s from the caller.
    info_file = urlopen(XKCD_INFO_URL % num)
    try:
        info = info_file.read().decode('utf-8')
    finally:
        info_file.close()
    info = json.loads(info)
    save_xkcd(num, info['img'], info['title'], info['alt'],
              info['year'], info['month'], info['day'])
    # Some comics are linked to a larger version of the comic. Save these.
    link = info['link']
    if link.startswith('http://imgs.xkcd.com/comics/'):
        save_xkcd_image(num, link.split('/')[-1])
def save_xkcd(num, img, title, alt, year, month, day):
    """Save the specified xkcd comic. Called by get_xkcd().

    Downloads the image named by the last path component of ``img`` and
    writes a UTF-8 text file "<num>-<name>.txt" containing the title, the
    date as "(year-month-day)", a blank line, and the alt text.  ``name`` is
    the image filename up to its first dot.
    """
    filename = img.split('/')[-1]  # http://imgs.xkcd.com/comics/...
    name = filename.split('.', 1)[0]
    print('Saving "%s"...' % title)
    save_xkcd_image(num, filename)
    # Break the alt text into lines at every configured split pattern.
    for pattern in ALT_NEWLINE_SPLITS:
        alt = re.sub(pattern, os.linesep, alt)
    txt_file = codecs.open('%d-%s.txt' % (num, name), 'w', 'utf-8')
    try:
        txt_file.write('%s (%s-%s-%s)%s%s' %
                       (title, year, month, day, os.linesep * 2, alt))
    finally:
        # Close the file even if the write fails, so the handle is not leaked.
        txt_file.close()
def save_xkcd_image(num, filename):
    """Save the specified xkcd comic image.

    Downloads ``filename`` from XKCD_IMG_URL and writes the raw bytes to
    "<num>-<filename>" in the current directory.  Exceptions from urlopen()
    or from file I/O propagate to the caller.
    """
    print('Downloading "%s"...' % filename)
    img = urlopen(XKCD_IMG_URL % filename)
    try:
        # Read everything before creating the output file so that a failed
        # download does not leave an empty file behind.
        data = img.read()
    finally:
        # try/finally rather than "with": the Python 2 response object is
        # not a context manager.
        img.close()
    img_file = open('%d-%s' % (num, filename), 'wb')
    try:
        img_file.write(data)
    finally:
        img_file.close()
# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
@auscompgeek
Copy link
Author

r3: implement HTMLParser

@auscompgeek
Copy link
Author

r4: use JSON API; stop on 404; (mostly) conform to PEP 8

@auscompgeek
Copy link
Author

r5: reduce carcinogenicity

@auscompgeek
Copy link
Author

r6: special-case comic 404 on @keepcalm444's request

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment