Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python script for finding duplicate files in one or more folders. Modification of: https://www.pythoncentral.io/finding-duplicate-files-with-python/ (a file-size counter was added)
# dupFinder.py
import os, sys, stat
import hashlib
def findDup(parentFolder):
    """Walk parentFolder recursively and group its files by content hash.

    Returns a dict keyed by the digest from hashfile(). Each value is a
    dict with two keys:
      'path' -- list of paths of the files that share that digest
      'size' -- size in bytes of the first file recorded for that digest

    Entries that are not regular files (for example broken symlinks) and
    files that cannot be read are skipped, so one bad entry does not abort
    the whole scan.
    """
    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            # os.walk also lists broken symlinks and special files under
            # fileList; those cannot be hashed meaningfully.
            if not os.path.isfile(path):
                continue
            try:
                file_hash = hashfile(path)
                size = os.stat(path)[stat.ST_SIZE]
            except (IOError, OSError) as err:
                # Permission problems, or the file vanished mid-scan.
                print('Skipping %s: %s' % (path, err))
                continue
            # Add or append the file path
            if file_hash in dups:
                dups[file_hash]['path'].append(path)
            else:
                # Always record the size so every entry has both keys.
                dups[file_hash] = {'path': [path], 'size': size}
    return dups
# Joins two dictionaries
def joinDicts(dict1, dict2):
    """Merge dict2 into dict1 in place.

    Keys present only in dict2 are copied over as they are. For a key
    present in both:
      * if both values are dicts (the {'path': [...], 'size': n} entries),
        the result is a new dict holding dict1's fields, any extra fields
        from dict2, and dict1's paths followed by those dict2 paths that
        are not already listed -- so a file reached through two
        overlapping folders is not reported as its own duplicate;
      * otherwise the two values are combined with '+', which covers
        plain lists of names.

    Returns None; dict1 is modified.
    """
    for key, incoming in dict2.items():
        if key not in dict1:
            dict1[key] = incoming
            continue
        existing = dict1[key]
        if isinstance(existing, dict) and isinstance(incoming, dict):
            # Dicts do not support '+', so merge field by field.
            merged = dict(incoming)
            merged.update(existing)  # fields already in dict1 win
            known_paths = list(existing.get('path', []))
            seen = set(known_paths)
            merged['path'] = known_paths + [
                p for p in incoming.get('path', []) if p not in seen
            ]
            dict1[key] = merged
        else:
            dict1[key] = existing + incoming
def hashfile(path, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at path.

    The file is opened in binary mode and read in chunks of blocksize
    bytes, so the whole file is never held in memory at once. Errors from
    opening or reading the file propagate to the caller.
    """
    hasher = hashlib.md5()
    # 'with' closes the file even if a read fails part-way through.
    with open(path, 'rb') as afile:
        # iter() with a sentinel stops at the first empty read (end of file).
        for buf in iter(lambda: afile.read(blocksize), b''):
            hasher.update(buf)
    return hasher.hexdigest()
def sizeof_fmt(num, suffix):
    """Format num as a human-readable quantity using binary (1024) prefixes.

    num is divided by 1024 until its magnitude drops below 1024 or the
    prefixes up to 'Zi' are exhausted, in which case 'Yi' is used. The
    result is the value with one decimal place, followed by the prefix
    and then suffix (for example 'B').
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = num
    position = 0
    # Written as a negated '<' so that values which never compare below
    # 1024 (such as NaN) keep scaling all the way to the final prefix.
    while position < len(prefixes) and not abs(scaled) < 1024.0:
        scaled = scaled / 1024.0
        position += 1
    if position == len(prefixes):
        return "%.1f%s%s" % (scaled, 'Yi', suffix)
    return "%3.1f%s%s" % (scaled, prefixes[position], suffix)
def printResults(dict1):
    """Print every group of duplicate files and the total space they waste.

    dict1 maps a hash to a dict with 'path' (list of file paths) and
    'size' (bytes). Only entries listing more than one path are reported,
    ordered by ascending size. Each group shows the formatted size, then
    its paths, then a separator line. The final total counts every copy
    beyond the first in each group.
    """
    groups = [entry for entry in dict1.values() if len(entry['path']) > 1]
    if not groups:
        print('No duplicate files found.')
        return

    groups.sort(key=lambda entry: entry['size'])
    separator = '___________________'
    print('Duplicates:')
    print(separator)
    wasted = 0
    for entry in groups:
        paths = entry['path']
        size = entry['size']
        # One copy is the "original"; the remaining ones are redundant.
        wasted += size * (len(paths) - 1)
        print('\t%s' % sizeof_fmt(size, 'B'))
        for path in paths:
            print('\t\t%s' % path)
        print(separator)
    print('TOTAL SIZE %s' % sizeof_fmt(wasted, 'B'))
# Command-line entry point: scan every folder given as an argument and
# report the duplicate files found across all of them.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        dups = {}
        folders = sys.argv[1:]
        for i in folders:
            # Iterate the folders given
            if os.path.exists(i):
                # Find the duplicated files and append them to the dups
                joinDicts(dups, findDup(i))
            else:
                print('%s is not a valid path, please verify' % i)
                # Exit with a non-zero status so callers can detect the failure.
                sys.exit(1)
        printResults(dups)
    else:
        print('Usage: python duplicates_finder.py folder or python duplicates_finder.py folder1 folder2 folder3')
@XBlack97

This comment has been minimized.

Copy link

XBlack97 commented Jun 5, 2018

pls how do i pass in my dir to check...?

@faulander

This comment has been minimized.

Copy link

faulander commented Jan 2, 2019

pls how do i pass in my dir to check...?

Usage: python duplicates_finder.py folder or python duplicates_finder.py folder1 folder2 folder3

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.