Skip to content

Instantly share code, notes, and snippets.

Last active Sep 4, 2022
What would you like to do?
Download images for old Pixelation threads from the Internet Archive. No longer functional due to changes on the Wayback Machine side, sadly :(
import argparse
import os
import re
import urllib.request

# Base URL of the Wayback Machine. Snapshot hrefs and image src paths scraped
# from its HTML are archive-relative, so they are joined onto this base.
# NOTE(review): the original URL literals were lost in transcription; these
# are reconstructed from context — confirm against the Wayback Machine.
ARCHIVE_BASE = "https://web.archive.org"
# Archived Pixelation (SMF) thread URL; thread id and page offset are
# appended (".0" is the first page in SMF topic URLs).
# NOTE(review): reconstructed — the exact forum URL was lost.
THREAD_URL = "http://wayofthepixel.net/index.php?topic="


def _fetch(opener, url):
    """GET *url* through *opener* and return the response body as text."""
    return opener.open(url).read().decode("utf-8", errors="replace")


def main():
    # argparse. only argument is the thread id, but this is useful regardless
    parser = argparse.ArgumentParser(
        description="Grab thread images from snapshots.")
    parser.add_argument("thread_id", type=int,
                        help="the thread id to get the images for")
    args = parser.parse_args()

    # create an opener so we can have a more or less legit user agent;
    # archive.org tends to reject the default Python user agent.
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]

    # see if we can actually find any snapshots of the thread's first page
    print("Looking for snapshots...")
    s = _fetch(opener, ARCHIVE_BASE + "/web/*/" + THREAD_URL
               + str(args.thread_id) + ".0")
    snaps = re.findall(r'"date captures"[\s\S]+?<a href="(.+?)"', s)

    # decide on a snapshot to use
    # WIP: currently using the latest one. might not be the best idea
    if snaps:
        print("Snapshot(s) found! Using latest one available")
        snap = snaps[-1]
    else:
        print("No snapshots. Exiting..")
        return

    # open up the snap. oh snap!
    print("\nLoading page 0..")
    s = _fetch(opener, ARCHIVE_BASE + snap)

    # parse page nav; we want a list of page offsets to traverse right away.
    # Single-page threads may have no pagelinks div at all — fall back to
    # just page 0 instead of crashing on a None match.
    nav = re.search(r'<div class="pagelinks floatleft">(.*?)</div>', s)
    offsets = re.findall(r'href=".*?\.(\d+)"', nav.group(1)) if nav else []
    if offsets:
        page_step = int(offsets[0])
        page_max = int(offsets[-1])
        pages = range(0, page_max + page_step, page_step)
    else:
        pages = [0]

    # time to start grabbing images
    images = []
    for i in pages:
        # we already have the HTML for the first page, no need to load it again
        if i > 0:
            print("Loading page %d.." % i)
            # snap ends in ".0"; drop the trailing 0 and splice in the offset
            s = _fetch(opener, ARCHIVE_BASE + snap[:-1] + str(i))
        images += re.findall(r'<img src="(\S+)" class=".*?bbc_img"', s)

    # we (hopefully) got them all. save em!
    # try to create the output directory first
    try:
        os.mkdir(str(args.thread_id))
    except OSError:
        # exists already, simply pass
        pass

    for img in images:
        print("Downloading %s.." % img)
        # NOTE(review): assumes the captured src paths are archive-relative,
        # like the snapshot hrefs — confirm against a real snapshot page.
        data = opener.open(ARCHIVE_BASE + img).read()
        path = "%d/%d_%s" % (args.thread_id, args.thread_id,
                             img.split("/")[-1])
        with open(path, "wb") as f:
            f.write(data)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment