guyc/find-duplicates.py

## find-duplicates.py
#!/usr/bin/env python

# Picasa sometimes duplicates imports and it adds a -NNN
# suffix (older versions seemed to use -N).  This code
# finds files with this naming scheme, and looks for the base
# file in the same dirctory.  If it finds the base, and the
# base has the same length, same md5 hash, the duplicate
# is suggested for deletion.
# Output is a set of 'rm <filename>' commands that can
# be run as a script.

import os
import re
import hashlib

def md5Checksum(filePath):
    with open(filePath, 'rb') as fh:
        m = hashlib.md5()
        while True:
            data = fh.read(8192)
            if not data:
                break
            m.update(data)
        return m.hexdigest()

dupBytes = 0
dupFiles = 0
totalFiles = 0

for root, dirs, files in os.walk('.'):
      for name in files:
          totalFiles += 1
          result = re.match('(.*)\-\d+(\..*)', name)
          if result:
              file1 = os.path.join(root,name)
              file0 = os.path.join(root,result.group(1)+result.group(2))
              if os.path.exists(file0):
                  size0 = os.path.getsize(file0)
                  size1 = os.path.getsize(file1)
                  if size0 == size1:
                     if md5Checksum(file0)==md5Checksum(file1):
                         print "rm %s" % file1
                         dupBytes += size0
                         dupFiles += 1


print "# %d of %d files are duplicates." % (dupFiles, totalFiles)
print "# Total bytes in duplicates: %d" % dupBytes
	#!/usr/bin/env python

	# Picasa sometimes duplicates imports and it adds a -NNN
	# suffix (older versions seemed to use -N). This code
	# finds files with this naming scheme, and looks for the base
	# file in the same dirctory. If it finds the base, and the
	# base has the same length, same md5 hash, the duplicate
	# is suggested for deletion.
	# Output is a set of 'rm <filename>' commands that can
	# be run as a script.

	import os
	import re
	import hashlib

	def md5Checksum(filePath):
	with open(filePath, 'rb') as fh:
	m = hashlib.md5()
	while True:
	data = fh.read(8192)
	if not data:
	break
	m.update(data)
	return m.hexdigest()

	dupBytes = 0
	dupFiles = 0
	totalFiles = 0

	for root, dirs, files in os.walk('.'):
	for name in files:
	totalFiles += 1
	result = re.match('(.)\-\d+(\..)', name)
	if result:
	file1 = os.path.join(root,name)
	file0 = os.path.join(root,result.group(1)+result.group(2))
	if os.path.exists(file0):
	size0 = os.path.getsize(file0)
	size1 = os.path.getsize(file1)
	if size0 == size1:
	if md5Checksum(file0)==md5Checksum(file1):
	print "rm %s" % file1
	dupBytes += size0
	dupFiles += 1


	print "# %d of %d files are duplicates." % (dupFiles, totalFiles)
	print "# Total bytes in duplicates: %d" % dupBytes