Skip to content

Instantly share code, notes, and snippets.

Last active September 4, 2022 07:04
Show Gist options
  • Save Ennea/92fa14fde3f466837467a5a99382cf52 to your computer and use it in GitHub Desktop.
Save Ennea/92fa14fde3f466837467a5a99382cf52 to your computer and use it in GitHub Desktop.
Download images for old Pixelation threads from the Internet Archive. Not functional anymore due to changes, sadly :(
import argparse
import os
import re
import urllib2
# Script overview: given a forum thread id, look for archived snapshots of
# that thread, walk every page of the chosen snapshot, collect the URLs of
# embedded images (<img> tags whose class ends in "bbc_img"), and save them
# into a directory named after the thread id.
#
# NOTE(review): this is Python 2 code (`print` statements, `urllib2`).
# NOTE(review): as shown here the script is NOT runnable -- every line sits at
# column 0 (block bodies have lost their indentation) and several statements
# are truncated or missing. Each damaged spot is flagged below; restore them
# from the original gist rather than guessing.
#
# Command-line interface: the only argument is the integer thread id.
parser = argparse.ArgumentParser(description="Grab thread images from snapshots.")
parser.add_argument("thread_id", type=int, help="the thread id to get the images for")
args = parser.parse_args()
# Build a urllib2 opener that sends a browser-like User-agent header with
# every request made through it.
opener = urllib2.build_opener()
opener.addheaders = [("User-agent", "Mozilla/5.0")]
# Step 1: see if we can actually find any snapshots of the thread.
print "Looking for snapshots..."
# NOTE(review): the next two lines are truncated. The first presumably opened
# a snapshot-listing URL built from "*/" + thread id + ".0" via `opener`, and
# the second presumably read the response body into `s` -- confirm both.
u ="*/" + str(args.thread_id) + ".0")
s =
# Collect the first link that follows each "date captures" marker in the
# listing HTML; each match is the href of one snapshot.
r = re.findall(r'"date captures"[\s\S]+?<a href="(.+?)"', s)
# Step 2: decide on a snapshot to use.
# WIP: currently using the latest one (last match). might not be the best idea
if len(r) > 0:
print "Snapshot(s) found! Using latest one available"
snap = r[len(r) - 1]
# NOTE(review): an `else:` branch (and presumably an exit after the message)
# appears to be missing here -- as written, this line is not tied to the
# "no matches" case. Confirm against the original.
print "No snapshots. Exiting.."
# Step 3: open the chosen snapshot (page 0 of the thread).
print "\nLoading page 0.."
# NOTE(review): truncated again -- presumably an `opener` request for a base
# URL + `snap`, followed by reading the body into `s`. Confirm.
u ="" + snap)
s =
# Parse the page navigation: we want the list of page offsets up front so all
# pages can be traversed in one loop.
# NOTE(review): the function call on the next line is missing (the remaining
# arguments are a pattern capturing the contents of the "pagelinks floatleft"
# <div>, and `s`), and the `re.findall` below it has lost its second argument
# and closing parenthesis. Presumably the second searched the captured group
# of the first -- confirm.
r ='<div class="pagelinks floatleft">(.*?)</div>', s)
r = re.findall(r'href=".*?\.(\d+)"',
# The first captured number is used as the per-page offset step and the last
# one as the highest page offset.
# NOTE(review): r[0] raises IndexError if no page links matched (e.g. what is
# presumably a single-page thread) -- verify how that case should behave.
page_step = int(r[0])
page_max = int(r[len(r) - 1])
# Offsets 0, step, 2*step, ... up to and including page_max.
pages = range(0, page_max + page_step, page_step)
# Step 4: time to start grabbing image URLs from every page.
images = []
# (leftover from an earlier design; no grab_images function is visible here)
# img_index = grab_images(args.thread_id, img_index)
for i in pages:
# we already have the HTML for the first page, no need to load it again
if i > 0:
print "Loading page %d.." % i
# NOTE(review): truncated -- presumably an `opener` request for the snapshot
# URL with its final character replaced by the page offset (`snap[:-1] +
# str(i)`), followed by reading the body into `s`. Confirm.
u ="" + snap[:-1] + str(i))
s =
# Every <img> whose class attribute ends in "bbc_img" counts as a thread image;
# the captured group is its src attribute.
r = re.findall(r'<img src="(\S+)" class=".*?bbc_img"', s)
images += r
# Step 5: we (hopefully) got them all. save em!
# try to create the directory first
# NOTE(review): the `try:` line and the directory-creation call that belong
# before this `except` are missing, as is the handler body (the comment
# suggests a bare `pass`). Confirm against the original.
# exists already, simply pass
except OSError:
for i in images:
print "Downloading %s.." % i
# NOTE(review): truncated -- presumably an `opener` request for a base URL +
# the image path `i`. Confirm.
u ="" + i)
# Output path is "<thread_id>/<thread_id>_<image file name>", where the file
# name is the last "/"-separated component of the image URL.
# NOTE(review): no write or close of `f` is visible after this line; if the
# original really stops here the file handle is leaked and nothing is saved.
f = open("%d/%d_%s" % (args.thread_id, args.thread_id, i.split("/")[-1]), "wb+")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment