Skip to content

Instantly share code, notes, and snippets.

@markuskreitzer
Created December 5, 2022 03:10
Show Gist options
  • Save markuskreitzer/b95579e4a811683f1f542272c57de583 to your computer and use it in GitHub Desktop.
Save markuskreitzer/b95579e4a811683f1f542272c57de583 to your computer and use it in GitHub Desktop.
Find Duplicate Files Fast!
#!/usr/bin/env python3
from pathlib import Path
import sys
import hashlib
# Root directory to scan: the first command-line argument, or the current
# directory when no argument is supplied (a missing argument used to crash
# with IndexError).
my_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")

# Global duplicate table filled in by find_files():
#   blake2b hex digest -> {'cnt': number of files with that digest,
#                          'path': [str paths of those files]}
b = {}
def filehash(filename: Path, chunk_size: int = 65536) -> str:
    """Return the BLAKE2b hex digest of the file at ``filename``.

    The file is read in binary mode in fixed-size chunks, so the whole file
    is never held in memory at once.

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. The digest does
            not depend on this value; it only affects how many reads are
            issued.

    Returns:
        The hexadecimal BLAKE2b digest of the file's contents.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b"" immediately, which would silently produce the
        # digest of empty input for every file.
        raise ValueError("chunk_size must be a positive integer")

    file_hash = hashlib.blake2b()
    with filename.open("rb") as f:
        while chunk := f.read(chunk_size):
            file_hash.update(chunk)
    return file_hash.hexdigest()
def find_files(p: Path) -> None:
    """Recursively hash every regular file under ``p`` into the global table ``b``.

    For each file found, its digest (from ``filehash``) is used as the key in
    ``b``; the entry's ``'cnt'`` is incremented and the file's path (as a
    string) is appended to the entry's ``'path'`` list. Subdirectories are
    visited recursively.

    Args:
        p: Directory to scan.

    Returns:
        None. Results are accumulated in the module-level dict ``b``.
    """
    for f in p.iterdir():
        if f.is_file():
            digest = filehash(f)
            # Create the entry on first sight, then update it uniformly.
            entry = b.setdefault(digest, {'cnt': 0, 'path': []})
            entry['cnt'] += 1
            entry['path'].append(str(f))
        elif f.is_dir():
            find_files(f)
        # Anything else (broken symlink, socket, FIFO, device node) is
        # neither hashable as a file nor iterable as a directory, so it is
        # skipped instead of being passed to iterdir(), which would raise.
# Populate the global table b by scanning the tree rooted at my_path.
find_files(my_path)

# Report each group of files sharing a digest: a header line with the last
# 16 hex characters of the digest, then one tab-indented path per file.
for digest, entry in b.items():
    if entry['cnt'] <= 1:
        # Only one file has this content; not a duplicate.
        continue
    short_id = digest[-16:]
    print(short_id)
    for dup_path in entry['path']:
        print(f'\t{dup_path}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment