Created
July 22, 2014 12:32
-
-
Save UNIcodeX/675447ccc9266a64d34e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
import os | |
import imghdr | |
import datetime | |
import re | |
import shutil | |
import sys | |
import optparse | |
import time | |
def check(dname, fname):
    """Hash one file and move it aside if its content was already seen.

    The file ``fname`` inside directory ``dname`` is read in 4096-byte chunks
    and its MD5 digest is compared with the digests stored in the module-level
    ``imgsigs`` dict (path -> digest). A file whose digest is already present
    is counted as a duplicate, printed, appended to ``results<strtime>.txt``
    and moved into ``options.dest_dir``; otherwise its digest is recorded.

    Reads the module-level names ``imgsigs``, ``options``, ``clear_string``,
    ``strtime`` and ``start_time``; updates ``file_count``,
    ``duplicate_count`` and ``filesPerSecond``.

    Args:
        dname: Directory that contains the file.
        fname: Name of the file within ``dname``.

    Returns:
        The module-level ``imgsigs`` dict.
    """
    global duplicate_count, file_count, filesPerSecond
    # Use the directory that was passed in (not whatever global happens to be
    # named ``dirname``) and let os.path pick the platform's separator.
    filename = os.path.join(dname, fname)

    hasher = hashlib.md5()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(4096)
            if not data:
                break
            hasher.update(data)
    dg = hasher.digest()

    file_count += 1
    # A coarse clock can report zero elapsed time for the first files; avoid
    # dividing by zero and report a rate of 0 until time has advanced.
    elapsed = time.time() - start_time
    filesPerSecond = file_count / elapsed if elapsed > 0 else 0
    sys.stdout.write(clear_string)
    sys.stdout.write('Rate: %d f/s || File: %s\r' % (filesPerSecond, filename))
    sys.stdout.flush()

    if dg in imgsigs.values():
        duplicate_count += 1
        # makedirs so that a nested destination such as "out/dups" also works.
        if not os.path.isdir(options.dest_dir):
            os.makedirs(options.dest_dir)
        sys.stdout.write(clear_string)
        sys.stdout.write(filename+'\n')
        with open('results%s.txt' % (strtime), 'a') as results:
            results.write(filename+'\n')
        # The source file is closed by now, so the move cannot be blocked by
        # our own open handle.
        shutil.move(filename, os.path.join(options.dest_dir, fname))
    else:
        imgsigs[filename] = dg
    return imgsigs
if __name__ == '__main__':
    # Script entry point: parse the command line, walk the source tree, hand
    # every eligible file to check(), then print a summary and wait for Enter.
    parser = optparse.OptionParser()
    parser.add_option('-s', '--source', action='store', dest='source_dir', default='./', help='Specify source directory')
    parser.add_option('-d', '--dest', action='store', dest='dest_dir', default='duplicates', help='Specify destination directory for duplicates')
    parser.add_option('-i', '--images', action='store_true', dest='limit_to_images', default=False, help='Limit processing to files of type, images (jpg, gif, ...)')
    options, remainder = parser.parse_args()

    # Module-level state that check() reads and updates through its
    # ``global`` declarations; the names must stay exactly as they are.
    file_count = 0
    clear_string = ' '*80+'\r'
    dt = datetime.datetime.today()
    # Timestamp used in the results file name, e.g. "7-22-2014--123200.5".
    strtime = str(dt.month)+'-'+str(dt.day)+'-'+str(dt.year)+'--'+re.sub(r'(\:)', '',str(dt.time()))
    imgsigs = {}
    duplicate_count = 0
    start_time = time.time()
    filesPerSecond = 0
    print('\nDuplicates:')

    # Compare real paths rather than substrings so that only the destination
    # directory itself (and this script file itself) are excluded.
    dest_root = os.path.normcase(os.path.abspath(options.dest_dir))
    script_path = os.path.normcase(os.path.abspath(sys.argv[0]))

    for dirname, dirlist, filelist in os.walk(options.source_dir):
        current_dir = os.path.normcase(os.path.abspath(dirname))
        # Never re-scan files that were already moved into the destination.
        if current_dir == dest_root or current_dir.startswith(dest_root + os.sep):
            continue
        for filename in filelist:
            path = os.path.join(dirname, filename)
            # Do not hash (and possibly move) the running script.
            if os.path.normcase(os.path.abspath(path)) == script_path:
                continue
            # With --images, skip anything imghdr does not recognise.
            if options.limit_to_images and not imghdr.what(path):
                continue
            check(dirname, filename)

    if duplicate_count == 0:
        sys.stdout.write(clear_string)
        sys.stdout.write('None\n\n')
    sys.stdout.write(clear_string)
    sys.stdout.write('\nAvg. rate: %d f/s' % (filesPerSecond))
    sys.stdout.flush()
    # duplicate_count is an integer, so format it with %d rather than %f.
    sys.stdout.write('\n\nFound %d duplicates. Any duplicates have been moved to "%s".\nPress [Enter] to exit program.\n' % (duplicate_count, options.dest_dir))
    sys.stdout.flush()

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment