Skip to content

Instantly share code, notes, and snippets.

@kade-robertson
Last active February 1, 2018 20:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kade-robertson/720d43635cf8c0318c88495b833c1abb to your computer and use it in GitHub Desktop.
Save kade-robertson/720d43635cf8c0318c88495b833c1abb to your computer and use it in GitHub Desktop.
Archive every single XKCD comic. Accepts an optional integer argument, the comic number to start from, so that an interrupted run can be resumed. An "xkcd" folder is created in the directory you run the script from. Its root level contains the images, and its "metadata" subfolder holds a nicely formatted TXT file and the JSON metadata for each comic.
#!/usr/bin/env python3
import os
import sys
import json
import shutil
import datetime
import requests
class Comic(object):
    """A single xkcd comic, built from that comic's decoded JSON metadata.

    Every key of the metadata mapping becomes an instance attribute of the
    same name. The methods of this class read ``title``, ``safe_title``,
    ``num``, ``alt``, ``img``, ``transcript``, ``day``, ``month`` and
    ``year``, so those keys must be present for them to work.
    """

    # Translation table used by filename(): '?' is dropped, and '/' plus the
    # remaining characters that Windows reserves in file names are replaced
    # by '-'. Only '?' and '/' used to be handled, so a title containing for
    # example ':' produced a name that cannot be created correctly there.
    _UNSAFE_CHARS = str.maketrans({
        '?': None,
        '/': '-',
        '\\': '-',
        ':': '-',
        '*': '-',
        '"': '-',
        '<': '-',
        '>': '-',
        '|': '-',
    })

    def __init__(self, data):
        """Copy every item of *data* onto the instance.

        The values stored under ``day``, ``month`` and ``year`` are passed
        through ``int()`` so they can be handed to ``datetime.date``; all
        other values are stored unchanged.
        """
        for key, value in data.items():
            if key in ('day', 'month', 'year'):
                value = int(value)
            self.__dict__[key] = value

    def __str__(self):
        """Return the human-readable, multi-line description of the comic."""
        pdate = datetime.date(year=self.year, month=self.month, day=self.day)
        return ''.join((
            'Title: {}\n'.format(self.title),
            'Index: {}\n'.format(self.num),
            'Date: {}\n'.format(pdate.strftime('%Y-%m-%d')),
            'Alt-Text: {}\n'.format(self.alt),
            'Permalink: https://xkcd.com/{}/\n'.format(self.num),
            'Image link: {}\n\n'.format(self.img),
            'Transcript\n----------\n\n{}'.format(self.transcript),
        ))

    def __repr__(self):
        """Return the instance attributes as a JSON document (4-space indent).

        ``day``, ``month`` and ``year`` appear as integers because they were
        converted in ``__init__``.
        """
        return json.dumps(self.__dict__, indent=4)

    def filename(self, ext=''):
        """Return ``'<num, zero-padded to 4 digits>-<sanitized title><ext>'``.

        The title is taken from ``safe_title`` and cleaned with
        ``_UNSAFE_CHARS``. *ext* is appended verbatim, so it should include
        its leading dot (for example ``'.txt'``).
        """
        safer = self.safe_title.translate(self._UNSAFE_CHARS)
        return '{:04d}-{}{}'.format(self.num, safer, ext)
# URL template of the per-comic JSON document; {0} is the comic number.
base_url = r'https://xkcd.com/{0}/info.0.json'

# Everything is stored below ./xkcd, relative to the directory the script is
# run from: images directly inside it, text/JSON files in ./xkcd/metadata.
base_dir = os.path.join(os.getcwd(), 'xkcd')
meta_dir = os.path.join(base_dir, 'metadata')

# Create whichever of the two directories is missing, parent first, and
# announce each one that gets created.
for notice, directory in (
    ('Creating xkcd dir at {}', base_dir),
    ('Creating metadata dir at {}', meta_dir),
):
    if os.path.isdir(directory):
        continue
    print(notice.format(directory))
    os.makedirs(directory)
with requests.Session() as sess:
    # An optional first command-line argument gives the comic number to
    # start (or resume) from; without it the archive starts at comic #1.
    comic = 1 if len(sys.argv) == 1 else int(sys.argv[1])
    # Number 404 is never requested: the loop below treats a 404 response as
    # "past the newest comic", so asking for it would end the run early
    # (presumably there is no comic #404 -- confirm against xkcd).
    if comic == 404:
        comic += 1
    print('Starting from comic #{}'.format(comic))

    while True:
        cdata = sess.get(base_url.format(comic))
        if cdata.status_code == 404:
            # No metadata for this number: the archive is complete.
            break
        # Any other error status (5xx, 403, ...) must not be parsed as comic
        # metadata; fail loudly with the HTTP error instead.
        cdata.raise_for_status()

        com = Comic(cdata.json())
        print('Archiving {}'.format(com.filename()))

        # Keep the real extension of the image (not every image is a JPEG);
        # fall back to '.jpg' only when the URL has no extension at all.
        img_ext = os.path.splitext(com.img.rsplit('/', 1)[-1])[1] or '.jpg'
        # The context manager releases the streamed connection back to the
        # session, and iter_content() undoes any transfer encoding, which
        # copying from the raw stream does not.
        with sess.get(com.img, stream=True) as imgd:
            if imgd.ok:
                img_path = os.path.join(base_dir, com.filename(img_ext))
                with open(img_path, 'wb') as imgf:
                    for chunk in imgd.iter_content(chunk_size=65536):
                        imgf.write(chunk)
            else:
                # Do not store an error page as if it were the image; the
                # metadata below is still written.
                print('Could not download image {} (HTTP {})'.format(
                    com.img, imgd.status_code))

        with open(os.path.join(meta_dir, com.filename('.txt')), 'w', encoding='utf-8') as plainf:
            plainf.write(str(com))
        with open(os.path.join(meta_dir, com.filename('.json')), 'w', encoding='utf-8') as jsonf:
            jsonf.write(repr(com))

        comic += 1
        if comic == 404:
            comic += 1

print('Archiving complete.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment