Last active
February 1, 2018 20:03
-
-
Save kade-robertson/720d43635cf8c0318c88495b833c1abb to your computer and use it in GitHub Desktop.
Archive every single XKCD comic. The script accepts one optional integer argument, the comic number to start from, so a run that was stopped mid-way can be resumed. An "xkcd" folder is created in the directory you run the script from. Its root level contains the images, plus a "metadata" folder that holds a nicely formatted TXT file and the raw JSON for each comic.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import sys | |
import json | |
import shutil | |
import datetime | |
import requests | |
class Comic(object):
    """A single xkcd comic, populated from its JSON metadata mapping.

    Every key of the mapping becomes an instance attribute of the same
    name. The methods below read ``title``, ``safe_title``, ``num``,
    ``alt``, ``img``, ``transcript``, ``day``, ``month`` and ``year``.
    """

    def __init__(self, data):
        """Copy every item of the mapping *data* onto the instance.

        The values stored under 'day', 'month' and 'year' are converted
        with ``int``; all other values are stored unchanged.
        """
        attrs = vars(self)
        for key, value in data.items():
            is_date_part = key in ('day', 'month', 'year')
            attrs[key] = int(value) if is_date_part else value

    def __str__(self):
        """Return the human-readable text summary of the comic."""
        published = datetime.date(
            year=self.year, month=self.month, day=self.day
        )
        sections = [
            'Title: {}\n'.format(self.title),
            'Index: {}\n'.format(self.num),
            'Date: {}\n'.format(published.strftime('%Y-%m-%d')),
            'Alt-Text: {}\n'.format(self.alt),
            'Permalink: https://xkcd.com/{}/\n'.format(self.num),
            'Image link: {}\n\n'.format(self.img),
            'Transcript\n----------\n\n{}'.format(self.transcript),
        ]
        return ''.join(sections)

    def __repr__(self):
        """Return all instance attributes as a JSON string (4-space indent)."""
        return json.dumps(vars(self), indent=4)

    def filename(self, ext=''):
        """Return '<zero-padded num>-<title><ext>' for use as a file name.

        The title comes from ``safe_title`` with every '?' removed and
        every '/' replaced by '-'. *ext* is appended verbatim, so it must
        include its leading dot if one is wanted.
        """
        # One pass over the title: drop '?', map '/' to '-'.
        cleaned = self.safe_title.translate(
            str.maketrans({'?': None, '/': '-'})
        )
        return '{:04d}-{}{}'.format(self.num, cleaned, ext)
# URL template for one comic's JSON metadata; {0} is the comic number.
base_url = r'https://xkcd.com/{0}/info.0.json'
# Images are written into ./xkcd, metadata files into ./xkcd/metadata.
base_dir = os.path.join(os.getcwd(), 'xkcd')
meta_dir = os.path.join(base_dir, 'metadata')
# Seconds to wait on any single HTTP request, so a stalled connection
# cannot hang the whole run.
request_timeout = 30

if not os.path.isdir(base_dir):
    print('Creating xkcd dir at {}'.format(base_dir))
    os.makedirs(base_dir)
if not os.path.isdir(meta_dir):
    print('Creating metadata dir at {}'.format(meta_dir))
    os.makedirs(meta_dir)

with requests.Session() as sess:
    # Optional first command-line argument: comic number to start from.
    comic = 1 if len(sys.argv) == 1 else int(sys.argv[1])
    print('Starting from comic #{}'.format(comic))
    while True:
        # Never request #404: a 404 response there would be mistaken for
        # the end of the archive (presumably that number is intentionally
        # missing upstream).
        if comic == 404:
            comic += 1
        cdata = sess.get(base_url.format(comic), timeout=request_timeout)
        # A 404 on the metadata endpoint means we ran past the last comic.
        if cdata.status_code == 404:
            break
        # Any other HTTP error is a real failure; surface it here instead
        # of as an opaque JSON decoding error on the next line.
        cdata.raise_for_status()
        com = Comic(cdata.json())
        print('Archiving {}'.format(com.filename()))

        # Keep the extension the image URL actually has rather than
        # labelling every file '.jpg'. Fall back to '.jpg' when the last
        # URL segment has no extension.
        img_ext = os.path.splitext(com.img.rsplit('/', 1)[-1])[1] or '.jpg'
        img_path = os.path.join(base_dir, com.filename(img_ext))
        # The context manager releases the streamed connection when done.
        with sess.get(com.img, stream=True, timeout=request_timeout) as imgd:
            if imgd.ok:
                with open(img_path, 'wb') as imgf:
                    # iter_content undoes any gzip/deflate transfer
                    # encoding, which copying from imgd.raw does not.
                    for chunk in imgd.iter_content(chunk_size=65536):
                        imgf.write(chunk)
            else:
                # Don't save an error page as the image; the metadata
                # files below are still written.
                print('Skipping image for comic #{} (HTTP {})'.format(
                    com.num, imgd.status_code))

        txt_path = os.path.join(meta_dir, com.filename('.txt'))
        with open(txt_path, 'w', encoding='utf-8') as plainf:
            plainf.write(str(com))
        json_path = os.path.join(meta_dir, com.filename('.json'))
        with open(json_path, 'w', encoding='utf-8') as jsonf:
            jsonf.write(repr(com))

        comic += 1

print('Archiving complete.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment