pixelation_image_grabber.py
Download images for old Pixelation threads from the Internet Archive. Not functional anymore due to archive.org changes, sadly :(
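Usage, reconstructed from the argparse setup below (12345 is a hypothetical thread id):

./pixelation_image_grabber.py 12345

Downloaded images land in a directory named after the thread id.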
#!/usr/bin/python
import argparse
import os
import re
import sys
import urllib2
# argparse. only argument is the thread id, but this is useful regardless
parser = argparse.ArgumentParser(description="Grab thread images from archive.org snapshots.")
parser.add_argument("thread_id", type=int, help="the thread id to get the images for")
args = parser.parse_args()
# create an opener using urllib2, so we can have a more or less legit user agent
opener = urllib2.build_opener()
opener.addheaders = [("User-agent", "Mozilla/5.0")]
# see if we can actually find any snapshots
print "Looking for snapshots..."
u = opener.open("https://web.archive.org/web/*/http://wayofthepixel.net/index.php?topic=%d.0" % args.thread_id)
s = u.read()
u.close()
r = re.findall(r'"date captures"[\s\S]+?<a href="(.+?)"', s)
# decide on a snapshot to use
# WIP: currently using the latest one. might not be the best idea
if r:
    print "Snapshot(s) found! Using latest one available"
    snap = r[-1]
else:
    print "No snapshots. Exiting.."
    sys.exit()
# open up the snap. oh snap!
print "\nLoading page 0.."
u = opener.open("https://web.archive.org" + snap)
s = u.read()
u.close()
# parse page nav; we want a list of pages that need traversing right away
m = re.search(r'<div class="pagelinks floatleft">(.*?)</div>', s)
r = re.findall(r'href=".*?\.(\d+)"', m.group(1))
if r:
    page_step = int(r[0])
    page_max = int(r[-1])
    pages = range(0, page_max + page_step, page_step)
else:
    # single-page thread: no numbered page links to crawl
    pages = [0]
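# worked example (hypothetical numbers): page links ending in ".15", ".30",
# ".90" give page_step = 15 and page_max = 90, so pages becomes
# [0, 15, 30, 45, 60, 75, 90], one entry per SMF message offset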
# time to start grabbing images
images = []
for i in pages:
    # we already have the HTML for the first page, no need to load it again
    if i > 0:
        print "Loading page %d.." % i
        # the snapshot URL ends in ".0"; swap the trailing 0 for the page offset
        u = opener.open("https://web.archive.org" + snap[:-1] + str(i))
        s = u.read()
        u.close()
    # SMF marks [img] BBCode images with the bbc_img class
    images += re.findall(r'<img src="(\S+)" class=".*?bbc_img"', s)
# we (hopefully) got them all. save em!
# try to create the directory first; if it exists already, simply pass
try:
    os.mkdir(str(args.thread_id))
except OSError:
    pass
for i in images:
    print "Downloading %s.." % i
    u = opener.open("https://web.archive.org" + i)
    f = open("%d/%d_%s" % (args.thread_id, args.thread_id, i.split("/")[-1]), "wb")
    f.write(u.read())
    f.close()
    u.close()
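# --- sketch: the "Looking for snapshots" scrape above relies on the old
# calendar markup, which archive.org has since changed (hence the note in the
# gist description). The Wayback CDX API provides the same lookup; below is a
# minimal, untested replacement for that step. find_snapshots is a hypothetical
# helper, not part of the original script.
import json
import urllib

def find_snapshots(thread_id):
    # CDX wants the target URL encoded, query string included
    target = urllib.quote("wayofthepixel.net/index.php?topic=%d.0" % thread_id, safe="")
    u = opener.open("https://web.archive.org/cdx/search/cdx?output=json&url=" + target)
    rows = json.load(u)
    u.close()
    # row 0 is the field header; data rows are
    # [urlkey, timestamp, original, mimetype, statuscode, digest, length].
    # rebuild snapshot paths in the same "/web/<timestamp>/<original>" shape
    # the rest of the script expects in `snap`
    return ["/web/%s/%s" % (row[1], row[2]) for row in rows[1:]]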