Skip to content

Instantly share code, notes, and snippets.

@EntityReborn
Created April 4, 2011 20:30
Show Gist options
  • Save EntityReborn/902361 to your computer and use it in GitHub Desktop.
simple image downloader, currently used for adult content
import re, os
import urllib2
# Root address of the gallery; category and page paths get appended to it.
baseurl = "http://hof.voyeurweb.com/gallery/"

# Captures the href of the anchor that directly follows the
# "Overview Page</a> | " text on a picture page.
nexturlre = re.compile(
    r'Overview Page</a> \| '
    r'<a href="([^"]*)">'
)

# Captures (src, alt) of <img> tags written exactly as
# <img src="..." alt="..." border="0" />.
imgurlre = re.compile(
    r'<img src="([^"]*)" '
    r'alt="([^"]*)" '
    r'border="0" />'
)
def downloadNext(url, cat):
    """Save the picture shown on one gallery page and find the next page.

    Fetches ``url``, takes the first image matched by ``imgurlre`` and
    stores it as ``pix/<cat>/<filename>`` unless that file already exists.
    The picture itself is only requested when it still has to be written.

    Args:
        url: Address of the gallery page to fetch.
        cat: Category name; used as the sub-directory below ``pix``.

    Returns:
        The first link matched by ``nexturlre`` on the page, or ``False``
        when there is no such link (or it is empty).

    Errors raised by ``urllib2.urlopen`` and by file I/O are not caught
    here and propagate to the caller.
    """
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement this script is written for.
    print("==================\nOpening %s" % url)
    page = urllib2.urlopen(url)
    try:
        data = page.read()
    finally:
        # Release the connection even if the read fails.
        page.close()

    print("Grabbing next url")
    nexturls = nexturlre.findall(data)
    print(nexturls)
    # An empty href is treated like a missing link so the caller's
    # ``while url:`` loop stops.
    nexturl = nexturls[0] if nexturls and nexturls[0] else False

    print("Grabbing pic url")
    pics = imgurlre.findall(data)
    if not pics:
        # Indexing an empty match list would raise IndexError and abort the
        # whole run; skip this page and keep following the chain instead.
        print("No pic found, skipping")
        return nexturl
    picurl = pics[0][0]
    picfilename = picurl.split("/")[-1]

    destdir = "pix/%s" % cat
    destpath = "pix/%s/%s" % (cat, picfilename)
    if not os.path.exists(destdir):
        os.makedirs(destdir)

    if os.path.exists(destpath):
        # Checked before opening the picture URL so that files we already
        # have do not cost a network request.
        print("Pic already exists, skipping")
        return nexturl

    print("downloading %s" % picurl)
    pic = urllib2.urlopen(picurl)
    try:
        picdata = pic.read()
    finally:
        pic.close()

    print("writing %s (%d)" % (picfilename, len(picdata)))
    with open(destpath, "wb") as picfile:
        picfile.write(picdata)
    return nexturl
# Category links on the index page: anchors with a class attribute whose
# href contains no '.', ':' or '"' characters.
indexre = re.compile('<a href="([^.:"]*)" class="[^"]*">[^<]*</a>')
# Anchor wrapped in <small> on a category page; the first match is used as
# the page to start downloading from.
firsturl = re.compile('<small><a href="([^"]*)">[^<]*</a></small>')

# Single-argument print(...) produces the same output under the Python 2
# print statement this script is written for.
f = urllib2.urlopen(baseurl)
try:
    data = f.read()
finally:
    # Release the connection even if the read fails.
    f.close()
cats = indexre.findall(data)
print("%d categories found." % len(cats))

for cat in cats:
    print("Downloading from %s" % cat)
    catindex = baseurl + cat
    f = urllib2.urlopen(catindex)
    try:
        data = f.read()
    finally:
        f.close()

    firstlinks = firsturl.findall(data)
    if not firstlinks:
        # Indexing an empty match list would raise IndexError and stop the
        # remaining categories from being processed; skip this one instead.
        print("No starting page found for %s, skipping" % cat)
        continue

    url = firstlinks[0]
    print("Starting with %s" % url)
    # downloadNext returns the next page link, or False at the end of the
    # chain, which ends this loop.
    while url:
        url = downloadNext(baseurl + cat + url, cat)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment