@Ennea
Last active Sep 4, 2022
Download images for old Pixelation threads from the Internet Archive. Not functional anymore due to archive.org changes, sadly :(
#!/usr/bin/python
import argparse
import os
import re
import urllib2
# argparse. only argument is the thread id, but this is useful regardless
parser = argparse.ArgumentParser(description="Grab thread images from archive.org snapshots.")
parser.add_argument("thread_id", type=int, help="the thread id to get the images for")
args = parser.parse_args()
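# example invocation (file name and thread id are made up, just for illustration):
#   python grab_thread_images.py 12345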
# create an opener using urllib2, so we can have a more or less legit user agent
opener = urllib2.build_opener()
opener.addheaders = [("User-agent", "Mozilla/5.0")]
# see if we can actually find any snapshots
print "Looking for snapshots..."
u = opener.open("https://web.archive.org/web/*/http://wayofthepixel.net/index.php?topic=" + str(args.thread_id) + ".0")
s = u.read()
u.close()
r = re.findall(r'"date captures"[\s\S]+?<a href="(.+?)"', s)
# decide on a snapshot to use
# WIP: currently using the latest one. might not be the best idea
if len(r) > 0:
    print "Snapshot(s) found! Using latest one available"
    snap = r[len(r) - 1]
else:
    print "No snapshots. Exiting.."
    exit()
# open up the snap. oh snap!
print "\nLoading page 0.."
u = opener.open("https://web.archive.org" + snap)
s = u.read()
u.close()
# parse page nav; we want a list of pages that need traversing right away
r = re.search(r'<div class="pagelinks floatleft">(.*?)</div>', s)
r = re.findall(r'href=".*?\.(\d+)"', r.group(1))
page_step = int(r[0])
page_max = int(r[len(r) - 1])
pages = range(0, page_max + page_step, page_step)
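# worked example with made-up numbers: if the page nav links end in .15, .30 and .45,
# then page_step = 15 and page_max = 45, so pages = [0, 15, 30, 45]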
# time to start grabbing images
images = []
# img_index = grab_images(args.thread_id, img_index)
for i in pages:
    # we already have the HTML for the first page, no need to load it again
    if i > 0:
        print "Loading page %d.." % i
        u = opener.open("https://web.archive.org" + snap[:-1] + str(i))
        s = u.read()
        u.close()
    r = re.findall(r'<img src="(\S+)" class=".*?bbc_img"', s)
    images += r
# we (hopefully) got them all. save em!
# try to create the directory first
try:
    os.mkdir(str(args.thread_id))
except OSError:
    # exists already, simply pass
    pass
for i in images:
    print "Downloading %s.." % i
    u = opener.open("https://web.archive.org" + i)
    f = open("%d/%d_%s" % (args.thread_id, args.thread_id, i.split("/")[-1]), "wb+")
    f.write(u.read())
    f.close()
    u.close()
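The part that broke is the snapshot lookup near the top, which scrapes the old "date captures" calendar markup from web.archive.org. A possible replacement is sketched below, assuming the Wayback Machine CDX API still accepts output=json and returns a header row followed by (urlkey, timestamp, original, ...) rows; the helper name find_latest_snapshot and the thread id 12345 are made up for illustration, and the sketch is untested.

# sketch only: list snapshots via the CDX API instead of the calendar page
import json
import urllib
import urllib2

def find_latest_snapshot(thread_id):  # hypothetical helper, not part of the script above
    opener = urllib2.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    query = urllib.urlencode({
        "url": "wayofthepixel.net/index.php?topic=%d.0" % thread_id,
        "output": "json"
    })
    u = opener.open("http://web.archive.org/cdx/search/cdx?" + query)
    rows = json.loads(u.read())
    u.close()
    # first row should be the field names; anything after that is a capture
    if len(rows) < 2:
        return None
    timestamp, original = rows[-1][1], rows[-1][2]
    return "https://web.archive.org/web/%s/%s" % (timestamp, original)

print find_latest_snapshot(12345)  # made-up thread id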