Skip to content

Instantly share code, notes, and snippets.

@UNIcodeX
Created July 22, 2014 12:32
Show Gist options
  • Save UNIcodeX/675447ccc9266a64d34e to your computer and use it in GitHub Desktop.
Save UNIcodeX/675447ccc9266a64d34e to your computer and use it in GitHub Desktop.
import hashlib
import os
import imghdr
import datetime
import re
import shutil
import sys
import optparse
import time
def _unique_destination(dest_dir, fname):
    """Return a path for ``fname`` inside ``dest_dir`` that is not taken yet.

    If ``dest_dir/fname`` already exists, ``_1``, ``_2``, ... is inserted
    before the extension until an unused name is found, so a previously
    moved file is never overwritten.
    """
    candidate = os.path.join(dest_dir, fname)
    stem, ext = os.path.splitext(fname)
    suffix = 1
    while os.path.exists(candidate):
        candidate = os.path.join(dest_dir, '%s_%d%s' % (stem, suffix, ext))
        suffix += 1
    return candidate


def check(dname, fname):
    """Hash one file and move it aside if its content was already seen.

    The file ``dname/fname`` is read in chunks and its MD5 digest is
    compared with the digests already stored in the module-level
    ``imgsigs`` dict (path -> digest).  A new digest is recorded; a known
    digest means the file is a duplicate, which is printed, appended to
    ``results<strtime>.txt`` and moved into ``options.dest_dir``.

    Module-level names read: ``start_time``, ``clear_string``, ``imgsigs``,
    ``options``, ``strtime``.  Module-level names updated:
    ``file_count``, ``duplicate_count``, ``filesPerSecond``.

    Args:
        dname: Directory that contains the file.
        fname: Base name of the file inside ``dname``.

    Returns:
        The module-level ``imgsigs`` dict.
    """
    global duplicate_count, file_count, filesPerSecond

    # Build the path from the arguments (not from the caller's loop
    # variable) and with the platform's own separator.
    filename = os.path.join(dname, fname)

    hasher = hashlib.md5()
    with open(filename, 'rb') as f:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: f.read(65536), b''):
            hasher.update(chunk)
    dg = hasher.digest()

    file_count += 1
    elapsed = time.time() - start_time
    # A coarse clock can report zero elapsed time for the first files;
    # keep the previous rate instead of dividing by zero.
    if elapsed > 0:
        filesPerSecond = file_count / elapsed

    # Progress line: blank the previous one, then rewrite in place ('\r').
    sys.stdout.write(clear_string)
    sys.stdout.write('Rate: %d f/s || File: %s\r' % (filesPerSecond, filename))
    sys.stdout.flush()

    # NOTE: imgsigs maps path -> digest, so this membership test is a
    # linear scan over every digest recorded so far.
    if dg in imgsigs.values():
        duplicate_count += 1
        if not os.path.isdir(options.dest_dir):
            os.makedirs(options.dest_dir)
        sys.stdout.write(clear_string)
        sys.stdout.write(filename+'\n')
        with open('results%s.txt' % (strtime), 'a') as results:
            results.write(filename+'\n')
        # Two duplicates may share a base name; never clobber an earlier one.
        shutil.move(filename, _unique_destination(options.dest_dir, fname))
    else:
        imgsigs[filename] = dg
    return imgsigs
if __name__ == '__main__':
    # Command-line entry point: walk the source directory, hand every
    # eligible file to check(), then print a summary and wait for [Enter].
    parser = optparse.OptionParser()
    parser.add_option('-s', '--source', action='store', dest='source_dir', default='./', help='Specify source directory')
    parser.add_option('-d', '--dest', action='store', dest='dest_dir', default='duplicates', help='Specify destination directory for duplicates')
    parser.add_option('-i', '--images', action='store_true', dest='limit_to_images', default=False, help='Limit processing to files of type, images (jpg, gif, ...)')
    options, remainder = parser.parse_args()

    # These names stay at module level on purpose: check() reads and
    # updates them as globals.
    file_count = 0
    duplicate_count = 0
    filesPerSecond = 0
    imgsigs = {}
    clear_string = ' '*80+'\r'
    dt = datetime.datetime.today()
    # Timestamp used in the results file name, e.g. 7-22-2014--123245.5
    # (colons removed so the name is valid on Windows).
    strtime = str(dt.month)+'-'+str(dt.day)+'-'+str(dt.year)+'--'+str(dt.time()).replace(':', '')
    start_time = time.time()

    # Compare normalized absolute paths rather than substrings, so that
    # unrelated paths that merely contain the destination name are still
    # scanned and the script is skipped even when launched by path.
    dest_root = os.path.normcase(os.path.abspath(options.dest_dir))
    script_path = os.path.normcase(os.path.abspath(sys.argv[0]))

    print('\nDuplicates:')
    for dirname, dirlist, filelist in os.walk(options.source_dir):
        current_dir = os.path.normcase(os.path.abspath(dirname))
        if current_dir == dest_root or current_dir.startswith(dest_root + os.sep):
            # Never rescan files that were already moved aside; emptying
            # dirlist in place stops os.walk from descending further.
            del dirlist[:]
            continue
        for filename in filelist:
            path = os.path.join(dirname, filename)
            if os.path.normcase(os.path.abspath(path)) == script_path:
                continue
            if options.limit_to_images and not imghdr.what(path):
                continue
            check(dirname, filename)

    if duplicate_count == 0:
        sys.stdout.write(clear_string)
        sys.stdout.write('None\n\n')
    sys.stdout.write(clear_string)
    sys.stdout.write('\nAvg. rate: %d f/s' % (filesPerSecond))
    sys.stdout.flush()
    # duplicate_count is an integer, so format it with %d (not %f).
    sys.stdout.write('\n\nFound %d duplicates. Any duplicates have been moved to "%s".\nPress [Enter] to exit program.\n' % (duplicate_count, options.dest_dir))

    # raw_input() only exists on Python 2; fall back to input() on Python 3.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    try:
        wait_for_enter()
    except EOFError:
        # No interactive stdin (e.g. piped or scheduled run): just exit.
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment