Created
December 27, 2010 09:36
-
-
Save astrofrog/755998 to your computer and use it in GitHub Desktop.
Recursively find folders with same size and checksum (still experimental)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
from hashlib import md5 | |
def dir_size(directory):
    """Return the total size in bytes of all files below *directory*.

    The tree is walked recursively with ``os.walk``. Files named
    ``.DS_Store`` are ignored so that they do not make otherwise
    identical folders differ in size.

    Raises ``OSError`` (from ``os.path.getsize``) if a file cannot be
    stat'ed, e.g. a dangling symbolic link.
    """
    folder_size = 0
    for path, _dirs, files in os.walk(directory):
        for filename in files:
            # Compare the file name itself, not the full path: a substring
            # test on the path would also drop every real file living under
            # a directory whose name happens to contain ".DS_Store".
            if filename == '.DS_Store':
                continue
            folder_size += os.path.getsize(os.path.join(path, filename))
    return folder_size
def dir_checksum(directory):
    """Return an md5 hash object covering every file below *directory*.

    The contents of all files (except those named ``.DS_Store``) are fed
    into a single md5 hash. Only file contents contribute to the digest;
    file and directory names are used solely to fix the traversal order.

    The caller obtains the printable digest with ``.hexdigest()``.

    Raises ``IOError``/``OSError`` if a file cannot be opened or read.
    """
    digest = md5()
    for path, dirs, files in os.walk(directory):
        # os.walk yields entries in whatever order the filesystem lists
        # them. Sorting (dirs in place, so the descent order is affected
        # too) makes identical folders hash identically.
        dirs.sort()
        for filename in sorted(files):
            if filename == '.DS_Store':
                continue
            # Stream each file in chunks: feeding successive chunks to
            # update() is equivalent to hashing the concatenation, without
            # holding the whole tree in memory, and the handle is closed
            # as soon as the file has been read.
            with open(os.path.join(path, filename), 'rb') as handle:
                for chunk in iter(lambda: handle.read(1 << 20), b''):
                    digest.update(chunk)
    return digest
def _find_same_size_folders(folder):
    """Map total size in bytes -> list of directories below *folder*."""
    size_dict = {}
    for path, dirs, _files in os.walk(folder):
        for directory in dirs:
            full_path = os.path.join(path, directory)
            try:
                size = dir_size(full_path)
            except (IOError, OSError) as exc:
                # Best effort: an unreadable folder is left out of the
                # comparison, but say so instead of hiding the error.
                sys.stderr.write("skipping %s: %s\n" % (full_path, exc))
                continue
            candidates = size_dict.setdefault(size, [])
            # os.walk is top-down, so ancestors are seen first. A folder
            # with the same size as one of its ancestors holds that
            # ancestor's entire content and is not an independent copy.
            nested = any(full_path.startswith(seen + os.sep)
                         for seen in candidates)
            if not nested:
                candidates.append(full_path)
    return size_dict


def _report_duplicates(size_dict, min_size=1024 ** 2):
    """Print each group of same-size folders that share a checksum."""
    for size in sorted(size_dict):
        directories = size_dict[size]
        # Only folders strictly larger than min_size are compared, and a
        # duplicate needs at least two candidates.
        if size <= min_size or len(directories) < 2:
            continue

        # Group by digest so that a matching subset is still reported when
        # other folders merely happen to have the same size.
        by_digest = {}
        first_seen = []
        for directory in directories:
            try:
                hashkey = dir_checksum(directory).hexdigest()
            except (IOError, OSError) as exc:
                sys.stderr.write("skipping %s: %s\n" % (directory, exc))
                continue
            if hashkey not in by_digest:
                by_digest[hashkey] = []
                first_seen.append(hashkey)
            by_digest[hashkey].append(directory)

        for hashkey in first_seen:
            matches = by_digest[hashkey]
            if len(matches) < 2:
                continue
            print("")
            print("%12i %s" % (size, hashkey))
            for directory in matches:
                print("- %s" % directory)


def main(argv):
    """Report duplicate folders below the directory named in *argv*.

    *argv* is expected to look like ``sys.argv``: the script name followed
    by exactly one directory. With any other argument count the usage line
    is printed and the process exits with status 0.
    """
    if len(argv) != 2:
        print("python find_duplicate_folders.py directory")
        sys.exit(0)
    _report_duplicates(_find_same_size_folders(argv[1]))


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment