Skip to content

Instantly share code, notes, and snippets.

Created April 12, 2011 00:10
What would you like to do?
Finds duplicate files probabilistically and quickly.
from paste.util.multidict import MultiDict
from pprint import pprint
import hashlib
import os
import sys
def md5(file_path, block_size=2**18):
    """Compute md5 hash of first ``block_size`` of the specified file.

    Only the leading ``block_size`` bytes are read, so two files that share
    the same prefix produce the same hash even if they differ later on.
    That is what makes the comparison fast but only probabilistic.

    :param file_path: path of the file to hash; it is opened in binary mode.
    :param block_size: maximum number of bytes to read from the start of the
        file (defaults to 256 KiB).
    :returns: hexadecimal md5 digest of the bytes that were read.
    :raises IOError: (``OSError`` on Python 3) if the file cannot be opened
        or read.
    """
    with open(file_path, 'rb') as f:
        # A single bounded read: files shorter than block_size are hashed
        # in full, longer ones are hashed by their prefix only.
        return hashlib.md5(f.read(block_size)).hexdigest()
if __name__ == "__main__":
    # Usage: pass one or more directories as command-line arguments.
    # Every readable, non-empty file below them is hashed with md5() and
    # grouped by that hash; groups with more than one entry are reported
    # as likely duplicates, largest first.
    #
    # d maps hash -> (path, size); MultiDict keeps every value added under
    # the same key, so one hash can hold several files.
    d = MultiDict()
    for directory in sys.argv[1:]:
        print('scanning %s' % directory)
        i = 1
        for root, dirs, files in os.walk(directory):
            for name in files:
                path = os.path.join(root, name)
                # skip if unreadable or empty
                if not os.access(path, os.R_OK):
                    print('unreable file: %s' % path)
                    # Hashing would fail on open(), so move on.
                    continue
                size = os.path.getsize(path)
                if size == 0:
                    print('empty file: %s' % path)
                    # All empty files hash alike; do not report them as
                    # duplicates of each other.
                    continue
                # hash the file
                file_hash = md5(path)
                d.add(file_hash, (path, size))
                # check if it has duplicates
                files_with_hash = d.getall(file_hash)
                if 1 < len(files_with_hash) < 10:
                    # Only a blank line is emitted here; the groups
                    # themselves are listed in the final output below.
                    print('')
                # progress monitoring
                i += 1
                if i % 50 == 0:
                    print(i)

    # sort by size
    dupes = [entries for entries in d.dict_of_lists().values()
             if len(entries) > 1]

    def get_size(entries):
        # Size of the first file in the group stands in for the whole group.
        return entries[0][1]

    dupes.sort(key=get_size, reverse=True)

    # final output
    print('=' * 80)
    for entries in dupes:
        print(get_size(entries))
        pprint([path for path, size in entries])
        print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment