Skip to content

Instantly share code, notes, and snippets.

@APadierna
Last active November 10, 2015 20:58
Show Gist options
  • Save APadierna/3a2c6ca53611f610a4fd to your computer and use it in GitHub Desktop.
Save APadierna/3a2c6ca53611f610a4fd to your computer and use it in GitHub Desktop.
Simple script which downloads all the XKCD strips listed in the XKCD archive (http://xkcd.com/archive/)
#!/usr/bin/env python
"""
Download all the comic strips listed in the XKCD archive page into the current
directory.
"""
from __future__ import print_function
import re
import sys
import urllib
# Alias the URL helpers as ``ul`` so the rest of the script can call
# ``ul.urlopen`` / ``ul.urlretrieve`` on both major Python versions:
# Python 2 exposes them in ``urllib``, Python 3 in ``urllib.request``.
if sys.version_info[0] <= 2:
    import urllib as ul
else:
    import urllib.request as ul
def main():
    """Script entry point: delegate all the work to ``get_xkcd_strips``."""
    get_xkcd_strips()
def get_xkcd_strips(archive_url='http://xkcd.com/archive/'):
    """
    Download every comic strip linked from the XKCD archive page.

    The archive page is fetched and scanned for links of the form
    ``<a href="/<id>/" title="<date>"><title></a><br/>``.  Each match is
    handed to ``get_comic_image`` with the name ``<id>_<date>_<title>`` and
    the strip URL ``http://xkcd.com/<id>``.

    Links whose title contains anything other than word characters and
    whitespace do not match the pattern and are therefore skipped.

    :param archive_url: URL of the archive index page.  Defaults to the
        public XKCD archive.
    :return: ``None``; the function only triggers the downloads.
    """
    response = ul.urlopen(archive_url)
    try:
        raw_index = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # Decode explicitly: on Python 3 ``str(bytes)`` would produce the
    # "b'...'" repr (with escaped newlines and non-ASCII bytes) rather than
    # the actual page text.
    xkcd_index = raw_index.decode('utf-8', 'replace')

    xkcd_strip_pattern = (
        r'<a href="/(?P<strip_id>\d+)/" title="(?P<date>[\d-]+)">'
        r'(?P<title>[\w\s]+)</a><br/>'
    )
    # findall yields one (strip_id, date, title) tuple per archive entry.
    for xkcd_strip in re.findall(xkcd_strip_pattern, xkcd_index):
        get_comic_image('_'.join(xkcd_strip),
                        'http://xkcd.com/' + xkcd_strip[0])
def get_comic_image(comic_name, comic_url):
    """
    Download the image of one comic strip into ``<comic_name>.png``.

    The strip's page is fetched and searched for the first URL under
    ``http://imgs.xkcd.com/comics/``.  If one is found it is saved next to
    the given name with a ``.png`` suffix (the suffix is fixed, whatever the
    extension of the image URL is).  Progress and failures are reported on
    standard output.

    :param comic_name: Destination file name without extension.
    :param comic_url: URL of the comic strip page to inspect.
    :return: ``None``.  Nothing is downloaded when the page contains no
        matching image URL; a failed download is reported, not raised.
    """
    print('--\nchecking:', comic_url)
    response = ul.urlopen(comic_url)
    try:
        raw_html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # Decode explicitly: on Python 3 ``str(bytes)`` would produce the
    # "b'...'" repr rather than the actual page text.
    html = raw_html.decode('utf-8', 'replace')

    strip_pattern = r'http://imgs\.xkcd\.com/comics/[\w\d\.]+'
    image_url = re.search(strip_pattern, html)
    if not image_url:
        return

    print('downloading:', image_url.group())
    try:
        ul.urlretrieve(image_url.group(), comic_name + '.png')
    except IOError:
        # ``comic_url`` is a plain string, not a match object; calling
        # ``.group()`` on it raised AttributeError and masked the real error.
        print('Error: Unable to obtain image from:', comic_url)
# Start downloading only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment