Skip to content

Instantly share code, notes, and snippets.

@melinath
Created September 19, 2012 14:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save melinath/3750027 to your computer and use it in GitHub Desktop.
Save melinath/3750027 to your computer and use it in GitHub Desktop.
Scraping xkcd's click & drag bit by bit.
import json
import os
import time
import requests
# Directory that receives the downloaded tiles and the not-found cache.
ROOT_DIR = "/Users/melinath/Pictures/xkcd"
# Largest tile coordinate probed in each of the four directions.
MAX_QUADRANT_SIZE = 100
# JSON list of tile names that answered 404, so later runs can skip them.
NOT_FOUND_PATH = os.path.join(ROOT_DIR, 'not_found.json')
# Seconds to wait on a single request before giving up on it.
REQUEST_TIMEOUT = 30

# Create the output directory up front; every write below targets it.
if not os.path.isdir(ROOT_DIR):
    os.makedirs(ROOT_DIR)

# Reload the names already known to be missing from a previous run.
if os.path.exists(NOT_FOUND_PATH):
    with open(NOT_FOUND_PATH, 'r') as f:
        not_found = set(json.load(f))
else:
    not_found = set()

# Scan a square of tiles that grows by one coordinate per pass. Earlier
# coordinates are revisited on every pass, but tiles already on disk or
# already recorded as missing are skipped without a request.
for quadrant_size in range(1, MAX_QUADRANT_SIZE + 1):
    print("Quadrant size:  %d" % quadrant_size)
    for vdir in ('n', 's'):
        for hdir in ('w', 'e'):
            # Coordinates are 1-based and run up to the size announced above.
            for vdist in range(1, quadrant_size + 1):
                for hdist in range(1, quadrant_size + 1):
                    # Tile names look like "<vdist><n|s><hdist><w|e>.png".
                    name = "".join((str(vdist), vdir, str(hdist), hdir, '.png'))
                    path = os.path.join(ROOT_DIR, name)
                    if os.path.exists(path) or name in not_found:
                        continue

                    url = "http://imgs.xkcd.com/clickdrag/{name}".format(name=name)
                    try:
                        response = requests.get(url, timeout=REQUEST_TIMEOUT)
                    except requests.RequestException as exc:
                        # Leave the tile unrecorded so a later pass retries it.
                        print("%s %s" % (name, exc))
                    else:
                        if response.status_code == 200:
                            # The payload is image bytes: write in binary mode
                            # and close the handle as soon as it is written.
                            with open(path, 'wb') as fp:
                                fp.write(response.content)
                        elif response.status_code == 404:
                            not_found.add(name)
                        else:
                            print("%s %s" % (response.status_code, name))
                    # Pause after every request to go easy on the server.
                    time.sleep(1)

    # Persist the cache after each pass so an interrupted run keeps what it
    # has learned so far.
    with open(NOT_FOUND_PATH, 'w') as f:
        json.dump(sorted(not_found), f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment