@dnozay
Created December 28, 2011 22:16
find duplicate files based on their md5sum — the script walks a directory tree, hashes every regular file in 1 KiB chunks (via mmap), and prints each digest that maps to more than one path.
#!/usr/bin/python
# find duplicate files based on their md5sum
import os
import hashlib
import mmap

def chunks(file_):
    try:
        map = mmap.mmap(file_.fileno(), 0)
    except EnvironmentError:
        raise StopIteration  # size = 0
    chunk = map.read(1024)
    while chunk:
        yield chunk
        chunk = map.read(1024)

def visit(md5sums, dirname, names):
    '''accumulate md5 hex digests for files
    md5sums - dict to augment; key,[files]
    dirname - parent directory
    names - [filenames]'''
    for name in names:
        path = os.path.join(dirname, name)
        if not os.path.isfile(path):
            continue
        file_ = None
        try:
            file_ = open(path, 'r+b')
            md5 = hashlib.md5()
            for chunk in chunks(file_):
                md5.update(chunk)
            md5sums.setdefault(md5.hexdigest(), []).append(path)
        finally:
            if file_:
                file_.close()

def find_dupes(rootpath):
    '''find duplicate files (note: subject to md5 digest collision)
    rootpath - parent directory'''
    md5sums = {}
    os.path.walk(rootpath, visit, md5sums)
    for hash, paths in md5sums.iteritems():
        if len(paths) < 2:
            continue
        print hash
        for path in paths:
            print '\t', path

find_dupes('.')
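
The gist targets Python 2: os.path.walk() and dict.iteritems() were removed in Python 3, print is a statement, and raising StopIteration inside a generator is an error since Python 3.7 (PEP 479). As a minimal sketch, assuming Python 3 is available, the same approach can be written with os.walk() and plain chunked reads in place of mmap; the function names here are illustrative, not part of the original gist.

#!/usr/bin/env python3
# Sketch of a Python 3 equivalent of the gist above.
import os
import hashlib

def md5sum(path, chunk_size=1024):
    '''Return the md5 hex digest of a file, read in 1 KiB chunks.'''
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()

def find_dupes(rootpath):
    '''Group files under rootpath by md5 digest and print duplicates.
    Note: still subject to md5 digest collisions, as in the original.'''
    md5sums = {}
    for dirname, _dirs, names in os.walk(rootpath):
        for name in names:
            path = os.path.join(dirname, name)
            if not os.path.isfile(path):
                continue
            md5sums.setdefault(md5sum(path), []).append(path)
    for digest, paths in md5sums.items():
        if len(paths) < 2:
            continue
        print(digest)
        for path in paths:
            print('\t', path)

if __name__ == '__main__':
    find_dupes('.')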