Created
January 31, 2014 08:13
-
-
Save ohhdemgirls/8728258 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib
import http.client
import os
import random
import subprocess
import sys
import threading
import timeit
import urllib.request
from multiprocessing.pool import ThreadPool

import requests
# Base URL that every generated image name is appended to.
imgUrl = "http://i.imgur.com/"

# Downloads land in dlPath first; files that survive the reverse image
# search are moved to finalPath.
dlPath = "./temp/"
finalPath = "./output/"

# SHA-256 of the placeholder file; downloads whose digest matches it are
# deleted instead of being queued.
shaSum = "9b5936f4006146e4e1e9025b474c02863c0b5614132ad40db4b925a10e8bfbb9"

# Run-time counters reported by showInfo(); all start at zero.
nLibReqErrors = nUrlsTried = nImgSearch = nUniqueImg = nNoRErrors = 0

# Reference point for the elapsed-time figures.
startTime = timeit.default_timer()
def randomnes(length=5, ext=".jpg"):
    """Build a random image file name and its full URL.

    Args:
        length: Number of random characters in the name (default 5, the
            value that used to be hard-coded).
        ext: File extension appended to the random name (default ".jpg").

    Returns:
        A ``(full_url, file_name)`` tuple, where ``file_name`` is the random
        name plus ``ext`` and ``full_url`` is the module-level ``imgUrl``
        followed by ``file_name``.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    # One random.choice call per character, exactly as before, so a seeded
    # generator yields the same names as the unrolled original.
    rT = ''.join(random.choice(alphabet) for _ in range(length))
    rTE = rT + ext
    return (imgUrl + rTE, rTE)
def downloadImages(rTFull, dlPath, filename):
    """Download ``rTFull`` into ``dlPath + filename`` and return that path.

    The destination file is always created, even when the download fails or
    returns no data; in those cases it is left empty and the matching
    module-level counter is incremented:

    * ``nLibReqErrors`` when the request itself raises.
    * ``nNoRErrors`` when the request succeeds but the body is empty.

    Args:
        rTFull: URL to fetch.
        dlPath: Destination directory prefix (concatenated as-is, so it is
            expected to end with a path separator).
        filename: Name of the file to create inside ``dlPath``.

    Returns:
        The path of the written file (``dlPath + filename``).
    """
    global nLibReqErrors, nNoRErrors
    local_file_name = dlPath + filename
    try:
        # The context manager closes the HTTP response deterministically.
        with urllib.request.urlopen(rTFull) as response:
            payload = response.read()
    except (OSError, http.client.HTTPException, ValueError):
        # URLError/HTTPError and socket errors are OSError subclasses;
        # ValueError covers malformed URLs. Previously a failure here left
        # the payload variable unbound and crashed on the next statement.
        nLibReqErrors += 1
        payload = b""
    else:
        if not payload:
            nNoRErrors += 1
    with open(local_file_name, 'wb') as f:
        f.write(payload)
    return local_file_name
def check_sha256sum(f, shaSum):
    """Delete ``f`` if its SHA-256 digest equals ``shaSum``.

    The digest is computed in-process with ``hashlib`` rather than by
    spawning the external ``sha256sum`` program, so this works on platforms
    that do not ship that tool.

    Args:
        f: Path of the file to check.
        shaSum: Lower-case hexadecimal SHA-256 digest to compare against.

    Returns:
        0 if the digest matched (the file has been removed), 1 otherwise.

    Raises:
        OSError: If ``f`` cannot be read or removed.
    """
    digest = hashlib.sha256()
    with open(f, 'rb') as handle:
        # Hash in fixed-size chunks so large files are not loaded at once.
        for chunk in iter(lambda: handle.read(65536), b""):
            digest.update(chunk)
    if digest.hexdigest() == shaSum:
        # Placeholder found: discard it.
        os.remove(f)
        return 0
    # Digest differs, so this is treated as a real picture.
    return 1
def check_output_dir(f):
    """Create the directory part of ``f`` if it does not exist yet.

    Args:
        f: A path; only ``os.path.dirname(f)`` is created, so a directory
            must be passed with a trailing separator (e.g. ``"./temp/"``).
            A path without any directory component is a no-op.
    """
    outputDir = os.path.dirname(f)
    print ("Checking if output folders exist... If they don't they will be created now.")
    # An empty dirname means "current directory"; os.makedirs("") would raise.
    if outputDir:
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(outputDir, exist_ok=True)
# Make sure the download and output directories exist before any work starts.
for _required_dir in (dlPath, finalPath):
    check_output_dir(_required_dir)
def rndImgUrl():
    """Try one random image URL and report whether it yielded a real image.

    Increments the module-level ``nUrlsTried`` counter, downloads a randomly
    named file into ``dlPath`` and checks it against ``shaSum``.

    Returns:
        The downloaded file's name when ``check_sha256sum`` reports a
        non-placeholder file, otherwise the integer 0.
    """
    global nUrlsTried
    nUrlsTried += 1
    candidate_url, candidate_name = randomnes()
    saved_path = downloadImages(candidate_url, dlPath, candidate_name)
    if check_sha256sum(saved_path, shaSum) > 0:
        return candidate_name
    return 0
def googleSearch(threadID, url, filename):
    """Reverse-search ``url`` on Google and keep the file only if unmatched.

    If the result page contains "Pages that include matching images" the
    downloaded file is deleted from ``dlPath``; otherwise it is moved to
    ``finalPath`` and counted as unique. ``nImgSearch`` is incremented for
    every completed request.

    Args:
        threadID: Unused; kept so existing callers keep working.
        url: Public URL of the image to look up.
        filename: Name of the already-downloaded file inside ``dlPath``.
    """
    global nImgSearch, nUniqueImg
    headers = {
        'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17",
    }
    search_url = 'http://www.google.com/searchbyimage?image_url=' + url
    r = requests.get(search_url, headers=headers)
    nImgSearch += 1
    if "Pages that include matching images" in r.text:
        os.remove(dlPath + filename)
    else:
        # Use the ``filename`` parameter here; the previous code read a
        # differently-cased module-level name instead of its own argument.
        os.rename(dlPath + filename, finalPath + filename)
        nUniqueImg += 1
def showInfo():
    """Clear the terminal and print the current run statistics.

    Ratios and the searches-per-second rate are only shown once at least
    one URL has been tried and one image has been reverse searched, so no
    division by zero can occur.
    """
    os.system('clear')
    have_ratios = nUrlsTried > 0 and nImgSearch > 0

    # Header: the three main counters, with percentages when available.
    if have_ratios:
        timeElapsed = timeit.default_timer() - startTime
        print ("Number of URLs tried: ", nUrlsTried, " | Valid: ", nImgSearch/nUrlsTried*100, "%")
        print ("Number of images reverse searched: ", nImgSearch)
        print ("Number of unique images found: ", nUniqueImg, " | Percentage: ", nUniqueImg/nImgSearch*100)
    else:
        print ("Number of URLs tried: ", nUrlsTried)
        print ("Number of images reverse searched: ", nImgSearch)
        print ("Number of unique images found: ", nUniqueImg)

    # Queue size and error counters are printed identically in both cases.
    print ("")
    print ("Number of images in queue: ", len(urlsInQueue))
    print ("Number of librequest errors: ", nLibReqErrors)
    print ("Number of noR errors: ", nNoRErrors)
    print ("")

    # Footer: elapsed time, plus throughput when ratios are available.
    if have_ratios:
        print ("Runing for : ", timeElapsed, "seconds")
        print ("Image Searches per second : ", nImgSearch/timeElapsed)
    else:
        print ("Runing for : ", timeit.default_timer()-startTime, "seconds")
# Pending work: image URLs and their file names, kept as two parallel stacks.
urlsInQueue = []
fileNames = []

pool = ThreadPool(processes=1)

# Never-started placeholder thread, so the first is_alive() check is False.
t = threading.Thread(target=googleSearch, args=(0, "", "",))

# NOTE(review): block nesting below was reconstructed from a copy whose
# indentation had been flattened -- confirm against the original script.
while True:
    # Start the next reverse image search once the previous one has finished.
    # Thread.isAlive() was removed in Python 3.9; is_alive() is the
    # supported spelling.
    if len(urlsInQueue) >= 1 and not t.is_alive():
        fileName = fileNames.pop()
        t = threading.Thread(target=googleSearch, args=(0, urlsInQueue.pop(), fileName,))
        t.start()
        showInfo()

    # Keep the queue topped up with fewer than 10 pending images.
    if len(urlsInQueue) < 10:
        async_result = pool.apply_async(rndImgUrl)
        return_val = async_result.get()
        # rndImgUrl returns 0 when the random URL did not yield an image.
        if return_val != 0:
            urlsInQueue.append(imgUrl + return_val)
            fileNames.append(return_val)
            showInfo()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment