@shurup14
Created January 16, 2017 12:19
Duplicate file search (Python 3)
import hashlib
import os


def check_size(path, files):
    """Group files by size; only same-size files can be duplicates."""
    suspects = {}
    for file in files:
        size = os.path.getsize(os.path.join(path, file))
        suspects.setdefault(size, []).append(file)
    # Yield only the size groups that contain more than one file.
    return (v for v in suspects.values() if len(v) > 1)


def check_hash(path, suspects, chunk=64 * 1024):
    """Hash each same-size group and report files with identical digests."""
    for files in suspects:
        duplicated = {}
        for file in files:
            hasher = hashlib.md5()
            with open(os.path.join(path, file), 'rb') as f:
                # Read in fixed-size chunks so large files do not
                # exhaust memory.
                buf = f.read(chunk)
                while buf:
                    hasher.update(buf)
                    buf = f.read(chunk)
            md5 = hasher.digest()
            duplicate = duplicated.get(md5)
            if duplicate:
                print("Duplicate detected: %s and %s" % (file, duplicate))
            else:
                duplicated[md5] = file


def find_duplicates(directory):
    """Report duplicate files (same size and MD5 digest) in a directory."""
    files = [f for f in os.listdir(directory)
             if os.path.isfile(os.path.join(directory, f))]
    suspects = check_size(directory, files)
    check_hash(directory, suspects)


if __name__ == '__main__':
    while True:
        directory = input('Please provide directory path: \n')
        # isdir (rather than exists) rejects paths that point at a file,
        # which would otherwise crash os.listdir.
        if os.path.isdir(directory):
            find_duplicates(directory)
            break
        print('Not a directory, please try again.')
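
A quick way to sanity-check the script is to call find_duplicates directly on a throwaway directory instead of typing at the prompt. The sketch below assumes the functions above are already defined in the same file or imported as a module; the temporary directory and file names are illustrative only, not part of the gist.

    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        # Two files with identical contents and one with different contents.
        for name, data in (('a.txt', b'same'), ('b.txt', b'same'),
                           ('c.txt', b'different')):
            with open(os.path.join(tmp, name), 'wb') as f:
                f.write(data)
        # a.txt and b.txt share both size and MD5 digest; c.txt has a
        # different size, so check_size filters it out before any hashing.
        find_duplicates(tmp)
        # Prints one "Duplicate detected" line for the a.txt/b.txt pair
        # (which file is named first depends on os.listdir ordering).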