# EntityReborn/downloader.py

## downloader.py
import re, os
import urllib2

baseurl = "http://hof.voyeurweb.com/gallery/"
nexturlre = re.compile(r'Overview Page</a> \| <a href="([^"]*)">')
imgurlre = re.compile(r'<img src="([^"]*)" alt="([^"]*)" border="0" />')

def downloadNext(url, cat):
    """Fetch one gallery page, save its picture and return the next page link.

    The page at ``url`` is downloaded and scanned with the module-level
    ``nexturlre`` and ``imgurlre`` patterns.  The first image matched is
    written to ``pix/<cat>/<last path segment of the image URL>``; if a file
    with that name is already there, the image is not fetched again.

    Args:
        url: URL of the gallery page to open.
        cat: Category name, used as the sub-directory below ``pix``.

    Returns:
        The first link captured by ``nexturlre``, or ``False`` when the page
        has no such link (or the captured link is empty).

    Raises:
        Whatever ``urllib2.urlopen``, ``os.makedirs`` or ``open`` raise; no
        network or file-system error is handled here.
    """
    # Single-argument print(...) emits the same text under Python 2's print
    # statement, so the log output is unchanged.
    print("==================\nOpening %s" % url)
    page = urllib2.urlopen(url)
    try:
        data = page.read()
    finally:
        # Close explicitly: the handle is otherwise left open until it is
        # garbage collected.
        page.close()

    print("Grabbing next url")
    nextlinks = nexturlre.findall(data)
    print(nextlinks)
    nexturl = nextlinks[0] if nextlinks else None

    print("Grabbing pic url")
    pics = imgurlre.findall(data)
    if not pics:
        # A page without a matching <img> tag used to raise IndexError and
        # abort the whole crawl; skip the picture and carry on instead.
        print("No pic found on page, skipping")
        return nexturl if nexturl else False
    picurl = pics[0][0]  # group 1 of the first match is the src attribute

    print("downloading %s" % picurl)
    picfilename = picurl.split("/")[-1]
    catdir = "pix/%s" % cat
    if not os.path.exists(catdir):
        os.makedirs(catdir)

    picpath = "%s/%s" % (catdir, picfilename)
    if os.path.exists(picpath):
        print("Pic already exists, skipping")
    else:
        # Only hit the network once we know the file is actually needed.
        pic = urllib2.urlopen(picurl)
        try:
            picdata = pic.read()
        finally:
            pic.close()
        print("writing %s (%d)" % (picfilename, len(picdata)))
        with open(picpath, "wb") as picfile:
            picfile.write(picdata)

    # Callers loop on truthiness, so normalise "no link" to False.
    return nexturl if nexturl else False

# Links on the gallery index page: anchors with a class attribute whose href
# contains no '.', ':' or '"' characters.
indexre = re.compile(r'<a href="([^.:"]*)" class="[^"]*">[^<]*</a>')
# Link wrapped in <small>...</small> on a category page; its first match is
# used as the page the crawl of that category starts from.
firsturl = re.compile(r'<small><a href="([^"]*)">[^<]*</a></small>')

f = urllib2.urlopen(baseurl)
try:
    data = f.read()
finally:
    f.close()
cats = indexre.findall(data)
print("%d categories found." % len(cats))

for cat in cats:
    print("Downloading from %s" % cat)
    catindex = baseurl + cat

    f = urllib2.urlopen(catindex)
    try:
        data = f.read()
    finally:
        f.close()

    starts = firsturl.findall(data)
    if not starts:
        # Without a starting link there is nothing to follow; move on to the
        # next category rather than failing with IndexError.
        print("No starting page found for %s, skipping" % cat)
        continue
    url = starts[0]
    print("Starting with %s" % url)

    # downloadNext returns False once a page has no next link, which ends
    # the loop for this category.
    while url:
        url = downloadNext(baseurl + cat + url, cat)