Created
August 26, 2012 21:00
-
-
Save thenonameguy/3483567 to your computer and use it in GitHub Desktop.
Download a 4chan thread to a directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import urllib | |
import urllib2 | |
import re | |
import time | |
if not len(sys.argv) >= 3: | |
print "Missing parameters." | |
print "Usage: python 4chan.py <url> <folder>" | |
sys.exit() | |
threadurl = sys.argv[1] | |
subfolder = sys.argv[2] | |
exp_imgurl = re.compile('4chan\.org/\w+/src/\d+\.(?:jpg|gif|png|jpeg)') | |
exp_picname = re.compile('\d+\.(?:jpg|gif|png|jpeg)') | |
ua = "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.9.1.4) Gecko/20091007 Firefox/3.5.4" | |
head = {'User-agent': ua} | |
print "Thread %s going to folder %s" % (threadurl, subfolder) | |
print "Fetching html..." | |
req = urllib2.Request(threadurl, None, head) | |
try: | |
response = urllib2.urlopen(req) | |
except urllib2.HTTPError, e: | |
if errorcount < 1: | |
errorcount = 1 | |
print "Request failed" | |
response = urllib2.urlopen(req) | |
except urllib2.URLError, e: | |
if errorcount < 1: | |
errorcount = 1 | |
print "Request failed" | |
response = urllib2.urlopen(req) | |
msg = response.read() | |
errorcount = 0 | |
print "Received %d bytes" % len(msg) | |
imgurls = exp_imgurl.findall(msg) | |
print "Found %d images" % len(imgurls) | |
if not os.path.exists(subfolder): | |
print "Folder %s does not exist. Creating..." % subfolder | |
os.makedirs(subfolder) | |
else: | |
print "Folder %s exists. I will just put all files in there." % subfolder | |
totalnumber = len(list(set(imgurls))) | |
for i, img in enumerate(list(set(imgurls))): | |
source = "http://images."+str(img) | |
filename = exp_picname.findall(source)[0] | |
destination = os.path.join(subfolder, filename) | |
if not os.path.isfile(destination): | |
try: | |
print "Downloading %d/%d: %s" % (i+1, totalnumber, source) | |
urllib.urlretrieve(source, destination) | |
time.sleep(0.25) # why? | |
except urllib.ContentTooShortError: | |
print "Image download failed, retrying..." | |
time.sleep(1) | |
urllib.urlretrieve(source, destination) | |
time.sleep(0.5) # why? | |
else: | |
print "File %s exists. Skipping..." % str(filename) | |
print "Aaaaaaand we are done. See you next time." | |
print "by thenonameguy <3" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment