Skip to content

Instantly share code, notes, and snippets.

@afternoon
Created July 17, 2012 22:44
Show Gist options
  • Save afternoon/3132629 to your computer and use it in GitHub Desktop.
Save afternoon/3132629 to your computer and use it in GitHub Desktop.
Find duplicate files in a directory (optionally only check those matching a filename pattern)
#!/usr/bin/env python
from fnmatch import fnmatch
from os import walk
from os.path import join
from hashlib import md5
from sys import argv
def recursive_glob(rootdir='.', pattern='*'):
    """Yield the path of every file under *rootdir* whose name matches *pattern*.

    The directory tree is traversed with ``os.walk``. *pattern* is a
    shell-style wildcard (``fnmatch``) tested against the bare file name
    only, never against the directory part. Each yielded path is the
    containing directory joined with the file name, so it is relative or
    absolute exactly as *rootdir* is.
    """
    # Use a separate loop name so the ``rootdir`` argument is not rebound
    # on every iteration of the walk.
    for dirpath, _dirnames, filenames in walk(rootdir):
        for filename in filenames:
            if fnmatch(filename, pattern):
                yield join(dirpath, filename)
def file_md5(filename):
    """Return the hexadecimal MD5 digest of the contents of *filename*.

    The file is opened in binary mode and hashed in fixed-size chunks, so
    arbitrarily large files can be processed without loading them into
    memory in one piece. Errors from ``open``/``read`` (e.g. a missing or
    unreadable file) propagate to the caller.
    """
    digest = md5()
    # ``with`` guarantees the handle is closed even if reading fails,
    # instead of relying on garbage collection to release it.
    with open(filename, 'rb') as stream:
        # iter(callable, sentinel) keeps reading 1 MiB chunks until EOF (b'').
        for chunk in iter(lambda: stream.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()
def find_duplicate_files(rootdir='.', pattern='*'):
    """Group the files under *rootdir* matching *pattern* by MD5 digest.

    Returns a dict mapping each hex digest to the list of file paths that
    produced it, in the order the files were encountered. Every hashed
    file is included, so a digest whose list holds a single path is a
    file without a duplicate; callers interested only in duplicates must
    filter for lists longer than one.
    """
    paths_by_digest = {}
    for path in recursive_glob(rootdir, pattern):
        digest = file_md5(path)
        # setdefault creates the list on first sight of a digest.
        paths_by_digest.setdefault(digest, []).append(path)
    return paths_by_digest
def show_duplicate_files(rootdir, pattern):
    """Print each group of identical files found under *rootdir*.

    Files whose names match *pattern* are grouped by MD5 digest via
    ``find_duplicate_files``. For every digest shared by more than one
    file, a header line ``<digest> (<count> files)`` is printed, followed
    by one indented line per file path. Digests with a single file are
    skipped. Returns ``None``.
    """
    # ``.items()`` and single-argument ``print(...)`` behave identically on
    # Python 2 and Python 3; the previous ``iteritems()`` / print-statement
    # form was a syntax error on Python 3.
    for hexhash, filelist in find_duplicate_files(rootdir, pattern).items():
        if len(filelist) > 1:
            print('%s (%s files)' % (hexhash, len(filelist)))
            for fn in filelist:
                # Two leading spaces: matches the old ``print ' ', fn`` output.
                print('  %s' % fn)
if __name__ == '__main__':
    # Usage: script [ROOTDIR [PATTERN]]
    # Missing arguments fall back to the defaults used by
    # find_duplicate_files ('.' and '*') instead of raising IndexError.
    # Quote PATTERN on the command line so the shell does not expand it.
    rootdir = argv[1] if len(argv) > 1 else '.'
    pattern = argv[2] if len(argv) > 2 else '*'
    show_duplicate_files(rootdir, pattern)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment