@NeatMonster
Created September 2, 2022 08:47
Simple and fast utility to find duplicated files. It narrows the candidates in three passes: by file size, then by the first 4 KiB of content, and finally by full MD5 digest, so the expensive hashing only runs on likely duplicates.
import collections
import hashlib
import os
import sys

# Collect every file under the directory given as the first argument.
paths = []
for root, dirs, files in os.walk(sys.argv[1]):
    for file in files:
        paths.append(os.path.join(root, file))

def find_dupes(func, paths):
    """Group paths by func(path) and yield each group with more than one member."""
    dupes = collections.defaultdict(list)
    for path in paths:
        dupes[func(path)].append(path)
    for group in dupes.values():
        if len(group) > 1:
            yield group

def getdata(path):
    # Cheap second pass: compare only the first 4 KiB of content.
    with open(path, "rb") as f:
        return f.read(4096)

def gethash(path):
    # Expensive final pass: hash the whole file in 4 KiB chunks
    # (the walrus operator requires Python 3.8+).
    with open(path, "rb") as f:
        digest = hashlib.md5()
        while chunk := f.read(4096):
            digest.update(chunk)
        return digest.hexdigest()

# Refine candidate groups in three passes: size, first 4 KiB, full MD5.
for size_group in find_dupes(os.path.getsize, paths):
    for data_group in find_dupes(getdata, size_group):
        for hash_group in find_dupes(gethash, data_group):
            print(", ".join(hash_group))