Skip to content

Instantly share code, notes, and snippets.

@astrofrog
Created December 27, 2010 09:36
Show Gist options
  • Save astrofrog/755998 to your computer and use it in GitHub Desktop.
Save astrofrog/755998 to your computer and use it in GitHub Desktop.
Recursively find folders with the same size and checksum (still experimental)
import sys
import os
from hashlib import md5
def dir_size(directory):
    """Return the total size in bytes of all files below *directory*.

    The tree is traversed recursively with ``os.walk`` and the sizes
    reported by ``os.path.getsize`` are summed. Files whose name contains
    ``.DS_Store`` (macOS Finder metadata) are ignored so that they do not
    make otherwise identical folders differ in size.

    Raises ``OSError`` if ``os.path.getsize`` cannot stat one of the files.
    """
    folder_size = 0
    for path, _dirs, files in os.walk(directory):
        for filename in files:
            # Test the file name only: matching against the full path would
            # silently skip every file below a directory whose own path
            # happens to contain ".DS_Store".
            if '.DS_Store' in filename:
                continue
            folder_size += os.path.getsize(os.path.join(path, filename))
    return folder_size
def dir_checksum(directory):
    """Return an MD5 hash object over the contents of all files below *directory*.

    Only file contents are hashed (not file or folder names). Files whose
    name contains ``.DS_Store`` (macOS Finder metadata) are ignored. The
    caller obtains the printable digest with ``.hexdigest()``.

    Raises ``IOError``/``OSError`` if a file cannot be opened or read.
    """
    digest = md5()
    for path, dirs, files in os.walk(directory):
        # os.walk yields entries in whatever order the filesystem lists them.
        # Sorting the sub-directory list in place (which os.walk honours in
        # top-down mode) and the file list makes the traversal order, and
        # therefore the checksum, reproducible for identical trees.
        dirs.sort()
        for filename in sorted(files):
            # Test the file name only, not the whole path, so a parent
            # directory containing ".DS_Store" does not exclude everything.
            if '.DS_Store' in filename:
                continue
            # Feed the hash in 1 MiB chunks instead of concatenating every
            # file into one giant in-memory string; the handle is closed
            # as soon as the file has been consumed.
            with open(os.path.join(path, filename), 'rb') as handle:
                for chunk in iter(lambda: handle.read(1 << 20), b''):
                    digest.update(chunk)
    return digest
# Folders whose total size is not strictly larger than this many bytes
# (1 MiB) are never reported.
MIN_SIZE = 1024 ** 2


def find_same_size_dirs(folder):
    """Group every directory below *folder* by its total size.

    Returns a dict mapping a size in bytes to the list of directory paths
    having that size. A directory is left out when its parent is already
    listed under the same size (the parent then holds nothing but that
    child, so reporting both would be redundant).
    """
    size_dict = {}
    for path, dirs, _files in os.walk(folder):
        for directory in dirs:
            full_path = os.path.join(path, directory)
            try:
                size = dir_size(full_path)
            except OSError:
                # Best effort: a folder that cannot be sized is skipped
                # rather than aborting the whole scan.
                continue
            same_size = size_dict.setdefault(size, [])
            if os.path.dirname(full_path) not in same_size:
                same_size.append(full_path)
    return size_dict


def print_duplicates(size_dict):
    """Print each set of folders sharing both total size and checksum.

    *size_dict* maps a size in bytes to a list of directory paths. Only
    sizes above ``MIN_SIZE`` are considered. Within one size, directories
    are grouped by checksum so that any two or more identical folders are
    reported even if other, different folders happen to have the same size.
    """
    for size in sorted(size_dict):
        if size <= MIN_SIZE:
            continue
        directories = size_dict[size]
        # A single folder cannot be a duplicate; two are enough to compare.
        if len(directories) < 2:
            continue
        by_hash = {}
        for directory in directories:
            try:
                hashkey = dir_checksum(directory).hexdigest()
            except (IOError, OSError) as exc:
                # Unreadable content: report it and carry on with the rest.
                sys.stderr.write("skipping %s: %s\n" % (directory, exc))
                continue
            by_hash.setdefault(hashkey, []).append(directory)
        for hashkey in sorted(by_hash):
            group = by_hash[hashkey]
            if len(group) < 2:
                continue
            print("")
            print("%12i %s" % (size, hashkey))
            for directory in group:
                print("- %s" % directory)


def main(argv):
    """Command-line entry point; returns the process exit status.

    *argv* is the full argument vector (program name first). Exactly one
    argument, the folder to scan, is expected.
    """
    if len(argv) != 2:
        print("python find_duplicate_folders.py directory")
        # Wrong usage is an error, so signal it with a non-zero status.
        return 1
    print_duplicates(find_same_size_dirs(argv[1]))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment