Skip to content

Instantly share code, notes, and snippets.

Last active June 14, 2018 09:58
Show Gist options
  • Save impshum/73b4fae7375d05588e47f7e4a26fa0dd to your computer and use it in GitHub Desktop.
Save impshum/73b4fae7375d05588e47f7e4a26fa0dd to your computer and use it in GitHub Desktop.
Download all xkcd comics
# Create a directory called comics next to the script first
import requests
# Downloads xkcd comics into the ./comics directory, one image file per comic.
# The directory must already exist (the script does not create it).

# NOTE(review): the run starts at comic 933 rather than 1 -- presumably a
# leftover from resuming an earlier run; confirm before relying on it to
# fetch the whole archive.
FIRST_COMIC = 933

# JSON metadata endpoint for a single comic; the placeholder is the comic number.
API_URL = 'https://xkcd.com/{}/info.0.json'

# Image types the script knows how to name on disk.
IMAGE_EXTENSIONS = ('png', 'jpg', 'gif')


def image_extension(img_url):
    """Return the extension of *img_url* without the dot, or None.

    Only the extensions listed in IMAGE_EXTENSIONS are recognised; any other
    URL (including one with no file name at all) yields None so the caller
    can skip it instead of reusing a stale extension.
    """
    for ext in IMAGE_EXTENSIONS:
        if img_url.endswith('.' + ext):
            return ext
    return None


def comic_path(num, title, ext):
    """Return the output path for a comic: comics/<num>-<title>.<ext>.

    Spaces in the title become underscores and slashes become dashes so the
    title cannot introduce extra path components.
    """
    return 'comics/{}-{}.{}'.format(
        num, title.replace(' ', '_').replace('/', '-'), ext)


def download_comics(start=FIRST_COMIC):
    """Download every comic from number *start* until the API answers 404."""
    n = start
    while True:
        if n == 404:
            # Skip number 404 up front; presumably it answers with HTTP 404,
            # which would otherwise be mistaken for the end of the archive.
            n += 1
        r = requests.get(API_URL.format(n))
        if r.status_code == 404:
            # No such comic: we have run past the newest one.
            break
        d = r.json()
        n = d['num']
        t = d['safe_title']
        u = d['img']
        print('{}: {}'.format(n, t))
        ext = image_extension(u)
        if ext is None:
            print('{}: skipped, unsupported image url {}'.format(n, u))
        else:
            # Fetch before opening the file so a failed request does not
            # leave an empty file behind.
            c = requests.get(u)
            with open(comic_path(n, t, ext), "wb") as f:
                f.write(c.content)
        n += 1


if __name__ == '__main__':
    download_comics()
Copy link

impshum commented Jun 2, 2018

Test it out man. I had to fix the 404 error/comic thing and the file types. I'll find a smaller way to do this when I do. For now it works from start to finish... BAM!

Copy link

4lpha0ne commented Jun 5, 2018

8 LOC:

# Compact xkcd downloader: saves every comic image into ./comics (the
# directory must already exist). Stops after two consecutive 404 responses,
# so a single missing comic number does not end the run.
import requests
lr = n = 1  # lr: status code of the previous request, n: current comic number
while True:
    r = requests.get('https://xkcd.com/{}/info.0.json'.format(n))
    # Download only when the page exists and the image URL ends in ".xxx".
    if r.status_code != 404 and r.json()['img'][-4]=='.':
        with open('comics/{}-{}.{}'.format(r.json()['num'], ''.join(['_' if c in '\\/`*{}[]()<>#+!?:' else c for c in r.json()['safe_title']]), r.json()['img'][-3:]), "wb") as f: f.write(requests.get(r.json()['img']).content)
    elif lr==404: break
    lr=r.status_code ; n += 1

Long version, with makedir:

# xkcd downloader: creates ./comics if needed and saves every comic image
# there as <num>-<title>.<ext>. Stops after two consecutive 404 responses.
import os                        # for mkdir                  -> remove for 12 line version
if not os.path.exists('comics'): # check for dir's existence  -> remove for 12 line version
    os.makedirs('comics')        # make dir, if needed        -> remove for 12 line version
import requests
last_status = n = 0              # init n and last request status code
while True:
    r = requests.get('https://xkcd.com/{}/info.0.json'.format(n)) # get page
    if r.status_code != 404 and r.json()['img'][-4]=='.': # check for status code and if there is a dot, which indicates a typical img filename ending
        d = r.json()             # parse
        print('{}: {}'.format(d['num'], d['safe_title'])) # print id + title -> remove for 12 line version
        with open('comics/{}-{}.{}'.format(d['num'], # a bit dense ;) create path
                  ''.join(['_' if c in '\\/`*{}[]()<>#+!?:' else c for c in d['safe_title']]), #replace unwanted chars
                  d['img'][-3:]), "wb") as f: # get extension from json img info, open file
            f.write(requests.get(d['img']).content) # write the content received from json img path
    elif last_status == 404:     # end condition: stop if we find a 2nd #404 error (there are no more pages)
        break
    last_status = r.status_code  # remember last status code
    n += 1                       # next one, please

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment