Last active
April 30, 2021 23:07
-
-
Save auscompgeek/5218149 to your computer and use it in GitHub Desktop.
A forwards and backwards compatible xkcd downloader written in Python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# This Source Code Form is subject to the terms of the Mozilla Public | |
# License, v. 2.0. If a copy of the MPL was not distributed with this | |
# file, You can obtain one at http://mozilla.org/MPL/2.0/. | |
"""An xkcd downloader by auscompgeek. | |
This Python script will download all xkcd comics within the range specified. | |
Both Python 2.x and 3k are supported. Tested on Python 2.7.3 and 3.2.1. | |
Comics will be saved in the format "<id>-<name>.<img-ext>". | |
The title and mouseover text will also be saved in "<id>-<name>.txt". | |
""" | |
# 2to3 switches to test: | |
## -f all -f idioms -f buffer -f set_literal -f ws_comma -x urllib -p | |
# urllib: line 49, tests for Py3k | |
# ====BEGIN CONFIGURATION SECTION==== | |
# If you use Python < 2.6, you should comment this future statement. | |
from __future__ import print_function, unicode_literals | |
# Should we use SSL to download the comic info and images?
USE_SSL = False

# Regular expressions applied to the alt text; every match is replaced
# with a line break when the text file is written.
# NOTE(review): the active pattern is a single space, so every space in
# the alt text becomes a line break -- confirm that this is intended.
# Disabled alternatives, kept as they were:
#' / ',
#r'(?<') (?>')',
#r'(?<.) ',
ALT_NEWLINE_SPLITS = [' ']
# =====END CONFIGURATION SECTION=====

# URL scheme shared by both endpoints, selected by USE_SSL.
_URL_SCHEME = 'https' if USE_SSL else 'http'

# The XKCD comic info URL, where %d is the comic ID.
XKCD_INFO_URL = _URL_SCHEME + '://xkcd.com/%d/info.0.json'

# The XKCD comic image URL, where %s is the comic filename.
XKCD_IMG_URL = _URL_SCHEME + '://sslimgs.xkcd.com/comics/%s'
# Standard-library modules used throughout the script.
import codecs
import os
import re
import sys

# Import urlopen and HTTPError from wherever this interpreter keeps them.
if sys.version_info[0] < 3:
    try:
        from urllib2 import urlopen, HTTPError
    except ImportError:  # ancient Python without urllib2
        from urllib import urlopen
        # HTTPError is set to None on this fallback path.
        HTTPError = None
else:
    from urllib.error import HTTPError
    from urllib.request import urlopen

# Import json, falling back to simplejson if it isn't available.
# Only a failed import should trigger the fallback, so catch ImportError
# rather than every Exception.
try:
    import json
except ImportError:
    import simplejson as json
def main():
    """Download every comic in an inclusive range of comic IDs.

    The first and last IDs are taken from the first two command-line
    arguments; whichever is missing is prompted for.  An empty answer
    selects 1 for the first ID and the first ID for the last ID.

    Comic 404 is skipped.  A 404 response ends the run; any other error is
    reported on stderr and the loop moves on to the next comic.
    """
    # raw_input only exists on Python 2; input is the Python 3 equivalent.
    if sys.version_info[0] < 3:
        prompt = raw_input
    else:
        prompt = input

    args = sys.argv

    if len(args) > 1:
        first = args[1]
    else:
        first = prompt('Enter first comic ID to download (1): ') or 1
    start_comic = int(first)

    if len(args) > 2:
        last = args[2]
    else:
        last = prompt('Enter last comic ID to download (%d): ' %
                      start_comic) or start_comic
    end_comic = int(last)

    for comic_id in range(start_comic, end_comic + 1):
        if comic_id == 404:
            print("Skipping 404 (it's a 404).")
            continue

        try:
            print('Getting xkcd %d...' % comic_id)
            get_xkcd(comic_id)
        except HTTPError:
            # Fetch the exception through sys.exc_info() instead of binding
            # it in the except clause.
            error = sys.exc_info()[1]
            if error.code == 404:
                sys.stderr.write('xkcd %d is 404. Stop.\n' % comic_id)
                return
            sys.stderr.write('xkcd %d: HTTPError: %d %s\n' %
                             (comic_id, error.code, error.reason))
        except Exception:
            sys.stderr.write('Error while getting xkcd %d: %r\n' %
                             (comic_id, sys.exc_info()[1]))
def get_xkcd(num):
    """Fetch the specified xkcd comic.

    Downloads the comic's JSON info and passes its image URL, title, alt
    text and date to save_xkcd().  If the info's "link" field points into
    the xkcd image directory, that image is saved as well.

    num -- the integer comic ID.

    Errors raised by urlopen(), such as HTTPError, propagate to the caller.
    """
    # Open the response before entering the try block.  If urlopen() raises
    # there is nothing to close, and a finally clause touching the unbound
    # name would replace the real error (e.g. the HTTPError for a 404).
    info_file = urlopen(XKCD_INFO_URL % num)
    try:
        # Decode explicitly as UTF-8 so that Python 2 (ASCII default) and
        # Python 3 treat non-ASCII text the same way.
        info = info_file.read().decode('utf-8')
    finally:
        info_file.close()
    info = json.loads(info)

    save_xkcd(num, info['img'], info['title'], info['alt'],
              info['year'], info['month'], info['day'])

    # Some comics are linked to a larger version of the comic. Save these.
    link = info['link']
    if link.startswith('http://imgs.xkcd.com/comics/'):
        save_xkcd_image(num, link.split('/')[-1])
def save_xkcd(num, img, title, alt, year, month, day):
    """Save the specified xkcd comic. Called by get_xkcd().

    Saves the comic image through save_xkcd_image(), then writes the title,
    date and alt text to "<num>-<name>.txt" encoded as UTF-8.

    num -- the integer comic ID.
    img -- the comic image URL; only its last path component is used.
    title -- the comic title.
    alt -- the mouseover text; each ALT_NEWLINE_SPLITS match in it is
           replaced with a line break.
    year, month, day -- the date parts, written out as given.
    """
    filename = img.split('/')[-1]  # http://imgs.xkcd.com/comics/...
    name = filename.split('.', 1)[0]
    print('Saving "%s"...' % title)
    save_xkcd_image(num, filename)

    for pattern in ALT_NEWLINE_SPLITS:
        alt = re.sub(pattern, os.linesep, alt)

    txt_file = codecs.open('%d-%s.txt' % (num, name), 'w', 'utf-8')
    try:
        txt_file.write('%s (%s-%s-%s)%s%s' %
                       (title, year, month, day, os.linesep * 2, alt))
    finally:
        # Close the file even when the write fails so the handle is not
        # leaked.
        txt_file.close()
def save_xkcd_image(num, filename):
    """Save the specified xkcd comic image.

    Downloads XKCD_IMG_URL % filename and writes the bytes to
    "<num>-<filename>".

    num -- the integer comic ID, used as the output file name prefix.
    filename -- the image file name on the server.

    Errors raised by urlopen() or by reading the response propagate to the
    caller; in that case no output file is created.
    """
    print('Downloading "%s"...' % filename)

    # Read the whole image first and always release the HTTP response.
    response = urlopen(XKCD_IMG_URL % filename)
    try:
        data = response.read()
    finally:
        response.close()

    # Only open the output file once the download has succeeded, so a
    # failed download does not leave an empty file behind.
    img_file = open('%d-%s' % (num, filename), 'wb')
    try:
        img_file.write(data)
    finally:
        img_file.close()
# Start downloading only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()
r4: use JSON API; stop on 404; (mostly) conform to PEP 8
r5: reduce carcinogenicity
r6: special-case comic 404 on @keepcalm444's request
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
r3: implement HTMLParser