Skip to content

Instantly share code, notes, and snippets.

@guyc
Created April 26, 2013 03:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save guyc/5464921 to your computer and use it in GitHub Desktop.
Save guyc/5464921 to your computer and use it in GitHub Desktop.
Script for finding duplicate image files created by picasa import.
#!/usr/bin/env python
# Picasa sometimes duplicates imports and it adds a -NNN
# suffix (older versions seemed to use -N). This code
# finds files with this naming scheme, and looks for the base
# file in the same dirctory. If it finds the base, and the
# base has the same length, same md5 hash, the duplicate
# is suggested for deletion.
# Output is a set of 'rm <filename>' commands that can
# be run as a script.
import os
import re
import hashlib
def md5Checksum(filePath):
with open(filePath, 'rb') as fh:
m = hashlib.md5()
while True:
data = fh.read(8192)
if not data:
break
m.update(data)
return m.hexdigest()
dupBytes = 0
dupFiles = 0
totalFiles = 0
for root, dirs, files in os.walk('.'):
for name in files:
totalFiles += 1
result = re.match('(.*)\-\d+(\..*)', name)
if result:
file1 = os.path.join(root,name)
file0 = os.path.join(root,result.group(1)+result.group(2))
if os.path.exists(file0):
size0 = os.path.getsize(file0)
size1 = os.path.getsize(file1)
if size0 == size1:
if md5Checksum(file0)==md5Checksum(file1):
print "rm %s" % file1
dupBytes += size0
dupFiles += 1
print "# %d of %d files are duplicates." % (dupFiles, totalFiles)
print "# Total bytes in duplicates: %d" % dupBytes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment