@dnozay
Created December 28, 2011 22:16
find duplicate files based on their md5sum — the script walks a directory tree, hashes every regular file in 1 KiB chunks (via mmap), and prints each digest that maps to more than one path.
#!/usr/bin/python
# find duplicate files based on their md5sum
import os
import hashlib
import mmap

def chunks(file_):
    try:
        map = mmap.mmap(file_.fileno(), 0)
    except EnvironmentError:
        raise StopIteration  # size = 0
    chunk = map.read(1024)
    while chunk:
        yield chunk
        chunk = map.read(1024)

def visit(md5sums, dirname, names):
    '''accumulate md5 hex digests for files
    md5sums - dict to augment; key,[files]
    dirname - parent directory
    names - [filenames]'''
    for name in names:
        path = os.path.join(dirname, name)
        if not os.path.isfile(path):
            continue
        file_ = None
        try:
            file_ = open(path, 'r+b')
            md5 = hashlib.md5()
            for chunk in chunks(file_):
                md5.update(chunk)
            md5sums.setdefault(md5.hexdigest(), []).append(path)
        finally:
            if file_:
                file_.close()

def find_dupes(rootpath):
    '''find duplicate files (note: subject to md5 digest collision)
    rootpath - parent directory'''
    md5sums = {}
    os.path.walk(rootpath, visit, md5sums)
    for hash, paths in md5sums.iteritems():
        if len(paths) < 2:
            continue
        print hash
        for path in paths:
            print '\t', path

find_dupes('.')
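
The gist targets Python 2: os.path.walk() and dict.iteritems() were removed in Python 3, print is a statement, and raising StopIteration inside a generator is an error since Python 3.7 (PEP 479). As a minimal sketch, assuming Python 3 is available, the same approach can be written with os.walk() and plain chunked reads in place of mmap; the function names here are illustrative, not part of the original gist.

#!/usr/bin/env python3
# Sketch of a Python 3 equivalent of the gist above.
import os
import hashlib

def md5sum(path, chunk_size=1024):
    '''Return the md5 hex digest of a file, read in 1 KiB chunks.'''
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()

def find_dupes(rootpath):
    '''Group files under rootpath by md5 digest and print duplicates.
    Note: still subject to md5 digest collisions, as in the original.'''
    md5sums = {}
    for dirname, _dirs, names in os.walk(rootpath):
        for name in names:
            path = os.path.join(dirname, name)
            if not os.path.isfile(path):
                continue
            md5sums.setdefault(md5sum(path), []).append(path)
    for digest, paths in md5sums.items():
        if len(paths) < 2:
            continue
        print(digest)
        for path in paths:
            print('\t', path)

if __name__ == '__main__':
    find_dupes('.')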