Finds duplicate files probabilistically and quickly.
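It hashes only the first 256 KiB (2**18 bytes) of each file, so files that share a digest are probable duplicates rather than certain ones; confirming a match requires comparing the full contents.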
#!/usr/bin/env python3
from paste.util.multidict import MultiDict
from pprint import pprint
import hashlib
import os
import sys


def md5(file_path, block_size=2**18):
    """Compute the md5 hash of the first ``block_size`` bytes of the file."""
    with open(file_path, 'rb') as f:
        return hashlib.md5(f.read(block_size)).digest()
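
# Because only the leading block of each file is hashed, two different files
# can share a digest. A minimal verification sketch (an addition, not part of
# the original gist): hash the entire file to confirm a candidate duplicate.
def full_md5(file_path, chunk_size=2**18):
    """Compute the md5 hash of the whole file, reading it in chunks."""
    h = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.digest()
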
if __name__ == "__main__":
    d = MultiDict()
    for directory in sys.argv[1:]:
        print('scanning', directory)
        i = 1
        for root, dirs, files in os.walk(directory):
            for name in files:
                path = os.path.join(root, name)
                # skip unreadable or empty files
                if not os.access(path, os.R_OK):
                    print('unreadable file:', path)
                    continue
                size = os.path.getsize(path)
                if size == 0:
                    print('empty file:', path)
                    continue
                # hash the file's leading block and record (path, size) under it
                digest = md5(path)
                d.add(digest, (path, size))
                # report as soon as a digest maps to more than one file
                files_with_hash = d.getall(digest)
                if 1 < len(files_with_hash) < 10:
                    pprint(files_with_hash)
                    print('')
                # progress monitoring: print a running file count
                i += 1
                if i % 50 == 0:
                    print(i)
    # collect groups with more than one file and sort by size, largest first
    dupes = [files for digest, files in d.dict_of_lists().items() if len(files) > 1]
    get_size = lambda files: files[0][1]
    dupes.sort(key=get_size, reverse=True)
    # final output
    print('=' * 80)
    for files in dupes:
        print(get_size(files))
        pprint([path for path, size in files])
        print('')
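Usage, assuming the script is saved as find_dupes.py (the gist does not name the file): pass one or more directories to scan as arguments:

    python3 find_dupes.py ~/Photos /mnt/backup

Candidate duplicate groups are printed as they are found; the final report lists every duplicate group sorted by file size, largest first.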