Skip to content

Instantly share code, notes, and snippets.

@impshum
Last active June 14, 2018 09:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save impshum/73b4fae7375d05588e47f7e4a26fa0dd to your computer and use it in GitHub Desktop.
Save impshum/73b4fae7375d05588e47f7e4a26fa0dd to your computer and use it in GitHub Desktop.
Download all xkcd comics
# Download xkcd comic images into a "comics" directory next to the script.
# The directory is created automatically if it does not exist yet.
import os

import requests

os.makedirs('comics', exist_ok=True)

# NOTE(review): the run starts at comic 933 rather than 1 -- presumably left
# over from resuming an interrupted download; set to 1 to fetch everything.
n = 933
while True:
    if n == 404:
        # Skip number 404 so that its response is not mistaken for the
        # end-of-archive 404 checked below.
        n += 1
    url = 'https://xkcd.com/{}/info.0.json'.format(n)
    r = requests.get(url)
    if r.status_code == 404:
        # No metadata for this number: we are past the latest comic.
        break
    d = r.json()
    n = d['num']
    t = d['safe_title']
    u = d['img']
    print('{}: {}'.format(n, t))
    # Take the extension (leading dot included) from the image URL itself.
    # The previous png/jpg/gif checks left `ext` unset for any other URL, so
    # the file silently reused the previous comic's extension (or raised
    # NameError on the very first iteration).
    ext = os.path.splitext(u)[1]
    if ext:
        x = 'comics/{}-{}{}'.format(n, t.replace(' ', '_').replace('/', '-'), ext)
        # Download before opening the file so that a failed request does not
        # leave an empty file behind.
        c = requests.get(u)
        with open(x, "wb") as f:
            f.write(c.content)
    n += 1
@sdegutis
Copy link

Super cool! You inspired me to give it a try too! Here's my take in Node.js, same #LOC:

const fs = require('fs'), https = require('https');

/**
 * Fetch `url` over HTTPS and parse the response body as JSON.
 *
 * An HTTP 404 response resolves to an empty object, so callers can
 * destructure the result and treat missing fields as "nothing there".
 *
 * @param {string} url - Address of the JSON document.
 * @returns {Promise<Object>} The parsed body, or `{}` on HTTP 404. Rejects on
 *   a request/response error or when the body is not valid JSON.
 */
const getJSON = url =>
  new Promise((resolve, reject) => {
    https
      .get(url, (response) => {
        if (response.statusCode === 404) {
          // Discard the unread body, then settle the promise: without the
          // resolve() call the caller's `await` would never complete.
          response.resume();
          resolve({});
          return;
        }

        let body = '';
        // Decode at the stream level so a multi-byte character split across
        // two chunks is not corrupted.
        response.setEncoding('utf8');
        response
          .on('data', (chunk) => {
            body += chunk;
          })
          .on('error', reject)
          .on('end', () => {
            // JSON.parse runs inside an event handler; route its exception
            // into the promise instead of letting it escape uncaught.
            try {
              resolve(JSON.parse(body));
            } catch (err) {
              reject(err);
            }
          });
      })
      .on('error', reject);
  });

/**
 * Download comic `i` and every following comic into the `comics/` directory.
 *
 * Comics are processed in ascending order until a number has no metadata.
 * Number 404 is the one exception: it is skipped and the run continues.
 * Image downloads are started without being awaited; a failed download is
 * logged and does not stop the run.
 *
 * @param {number} i - Number of the first comic to download.
 * @returns {Promise<void>} Settles once the last metadata request has been
 *   handled; rejects if a metadata request fails.
 */
async function save(i) {
  for (let num = i; ; num += 1) {
    const { img, safe_title: safeTitle } =
      await getJSON(`https://xkcd.com/${num}/info.0.json`);

    if (!img) {
      // No image for this number: carry on past 404, otherwise we are done.
      if (num === 404) continue;
      return;
    }

    console.log(`${num}: ${safeTitle}`);

    // Keep the image's own extension so the saved file opens correctly.
    const extensionMatch = /\.[^./]+$/.exec(img);
    const extension = extensionMatch ? extensionMatch[0] : '';
    // A string pattern replaces only the first occurrence, so use global
    // regexes; '/' must also go because it would be read as a directory.
    const title = safeTitle.replace(/ /g, '_').replace(/\//g, '-');
    const filename = `comics/${num}-${title}${extension}`;

    const reportFailure = (err) =>
      console.error(`${num}: download failed: ${err.message}`);

    https
      .get(img, (res) => {
        res.pipe(fs.createWriteStream(filename)).on('error', reportFailure);
      })
      .on('error', reportFailure);
  }
}

// Start with comic 1. Report a failed run instead of leaving the promise
// returned by save() unhandled.
save(1).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

@impshum
Copy link
Author

impshum commented May 31, 2018

@sdegutis Did you do a full run?

@sdegutis
Copy link

@impshum I did not :/

@impshum
Copy link
Author

impshum commented Jun 2, 2018

Test it out, man. I had to fix the handling of comic 404 (the "404 error" comic) and of the image file types. I'll post a shorter version if I find one. For now it works from start to finish... BAM!

@4lpha0ne
Copy link

4lpha0ne commented Jun 5, 2018

8 LOC:

# Compact xkcd downloader: walks comic numbers upwards from 1 and writes each
# image into an existing "comics" directory (the directory is not created here).
import requests
# lr holds the status code of the previous metadata request; it starts at 1,
# i.e. any value other than 404.
lr = n = 1
while True:
    r = requests.get('https://xkcd.com/{}/info.0.json'.format(n))
    # Download only when the metadata exists and the 4th character from the end
    # of the image URL is a dot, i.e. the URL ends in a three-letter extension.
    if r.status_code != 404 and r.json()['img'][-4]=='.':
        # File name: "<num>-<title>.<ext>", with the listed characters in the
        # title replaced by '_' and the extension taken from the last three
        # characters of the image URL. Note that r.json() is evaluated again for
        # every field used on this line.
        with open('comics/{}-{}.{}'.format(r.json()['num'], ''.join(['_' if c in '\\/`*{}[]()<>#+!?:' else c for c in r.json()['safe_title']]), r.json()['img'][-3:]), "wb") as f: f.write(requests.get(r.json()['img']).content)
    # Reached when the request was a 404 or the URL has no three-letter
    # extension; stop only if the previous request was a 404 as well.
    elif lr==404: break
    lr=r.status_code ; n += 1  # remember this status, move on to the next number

Long version, with makedir:

# Download every xkcd comic image into a "comics" directory, creating the
# directory when needed. The run ends at the second consecutive 404, so one
# isolated missing number does not stop the download.
import os

import requests

# Characters replaced by '_' when a comic title is used in a file name.
UNSAFE_CHARS = '\\/`*{}[]()<>#+!?:'

# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs('comics', exist_ok=True)

last_status = None  # status code of the previous metadata request
n = 1               # first comic number to request
while True:
    r = requests.get('https://xkcd.com/{}/info.0.json'.format(n))
    if r.status_code == 404:
        # End condition: two 404 responses in a row mean there are no more
        # pages. Testing the current status as well keeps a non-404 response
        # from ever ending the run.
        if last_status == 404:
            break
    else:
        # Surface other HTTP failures as HTTP errors rather than as a JSON
        # decoding error on an error page.
        r.raise_for_status()
        d = r.json()  # parse once and reuse
        # Take the extension (leading dot included) from the image URL; an
        # empty result means the URL does not name an image file, so skip it.
        ext = os.path.splitext(d['img'])[1]
        if ext:
            print('{}: {}'.format(d['num'], d['safe_title']))
            title = ''.join('_' if c in UNSAFE_CHARS else c
                            for c in d['safe_title'])
            # Download before opening the file so a failed request does not
            # leave an empty file behind.
            image = requests.get(d['img'])
            with open('comics/{}-{}{}'.format(d['num'], title, ext), "wb") as f:
                f.write(image.content)
    last_status = r.status_code  # remember last status code
    n += 1                       # next one, please

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment