# Created January 23, 2010 08:20
import hashlib
import os
import subprocess

from termcolor import colored
# Folder that is scanned for e-mail files. The path is relative, so it is
# resolved against the current working directory.
rootdir = 'Documents/thunderbird/tbird2oe/1-LocalFolders/localhost-Inbox-pruned'

# Maps header digest -> list of file paths (relative to rootdir) that share
# it. Filled in by main().
md5 = {}

# Only these header lines take part in the comparison. The leading "From -"
# line is often different for the same e-mail downloaded at different times,
# so hashing whole files would miss duplicates.
HEADER_PREFIXES = (b'Date:', b'From:', b'Subject:')


def header_lines(path):
    """Return the lines of the file at *path* that start with a compared header.

    The file is read in binary mode and every line is checked (not just the
    header section), matching what ``egrep '^Date:|^From:|^Subject:'`` selects.
    """
    with open(path, 'rb') as handle:
        return [line for line in handle if line.startswith(HEADER_PREFIXES)]


def header_digest(path):
    """Return the hex MD5 of the compared header lines of *path*.

    Returns None when the file contains none of those lines. Without this,
    every header-less file would share the digest of the empty string and
    be treated as a duplicate of all the others.
    """
    lines = header_lines(path)
    if not lines:
        return None
    return hashlib.md5(b''.join(lines)).hexdigest()


def header_text(path):
    """Return the compared header lines of *path* as one printable string."""
    text = b''.join(header_lines(path))
    if not isinstance(text, str):
        # Python 3: bytes must be decoded before they can be formatted.
        text = text.decode('utf-8', 'replace')
    return text


def record_mail(groups, root, relpath):
    """File *relpath* (relative to *root*) under its header digest in *groups*.

    Returns the digest, or None when the file has no compared headers and was
    therefore left out of *groups*. Errors from opening or reading the file
    propagate to the caller.
    """
    digest = header_digest(os.path.join(root, relpath))
    if digest is not None:
        groups.setdefault(digest, []).append(relpath)
    return digest


def remove_duplicates(root, paths):
    """Delete every file listed in *paths* except the first one.

    *paths* holds paths relative to *root* and is trimmed in place, so that
    afterwards it contains only the surviving entry. Returns the number of
    files removed.
    """
    removed = 0
    while len(paths) > 1:
        victim = paths.pop()
        print("DELETE: %s" % victim)
        os.remove(os.path.join(root, victim))
        removed += 1
    return removed


def main():
    """Scan rootdir, then delete all but one e-mail of every duplicate group."""
    for subdir, _dirs, files in os.walk(rootdir):
        print(colored("FOUND: %d e-mails" % len(files), "cyan"))
        for name in files:
            # os.walk descends into subfolders, so the path has to be built
            # from the folder currently being visited, not from rootdir.
            relpath = os.path.relpath(os.path.join(subdir, name), rootdir)
            record_mail(md5, rootdir, relpath)
        print("Duplicate hash find: %d files" % len(files))

    delcount = 0
    for digest, paths in md5.items():
        if len(paths) < 2:
            continue
        print("%s Count: %d Deleting: %d duplicates"
              % (digest, len(paths), len(paths) - 1))
        print(paths)
        delcount += remove_duplicates(rootdir, paths)
        # Show the headers of the copy that was kept.
        kept = header_text(os.path.join(rootdir, paths[0]))
        print(colored('\t%s' % kept.replace('\n', '\n\t'), 'yellow'))
        print('-' * 64)
    print(colored("TOTAL: Deleted %d duplicates" % delcount, "red"))


if __name__ == "__main__":
    main()
