Created
January 23, 2010 08:20
-
-
Save lkraav/284503 to your computer and use it in GitHub Desktop.
kill-outlook-eml-duplicates.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib
import os
import subprocess

from termcolor import colored
# Remove duplicate .eml messages below ``rootdir``.
#
# Two messages count as duplicates when their Date:, From: and Subject: lines
# are byte-for-byte identical.  The first copy met during the directory walk
# is kept; every later copy is deleted from disk.

# Directory holding the exported messages to de-duplicate.
rootdir = 'Documents/thunderbird/tbird2oe/1-LocalFolders/localhost-Inbox-pruned'

# Line prefixes that make up a message fingerprint.  The leading "From - ..."
# line is deliberately left out: it often differs between copies of the same
# message downloaded at different times.
_HEADER_PREFIXES = (b'Date:', b'From:', b'Subject:')


def _header_lines(path):
    """Return the raw lines of *path* starting with Date:, From: or Subject:.

    The whole file is scanned (not only the header section), which matches
    the ``egrep '^Date:|^From:|^Subject:'`` filter this function replaces.
    Every returned line ends with a newline, as egrep output would.
    """
    matched = []
    with open(path, 'rb') as handle:
        for line in handle:
            if not line.startswith(_HEADER_PREFIXES):
                continue
            if not line.endswith(b'\n'):
                # Last line of a file without a trailing newline.
                line += b'\n'
            matched.append(line)
    return matched


def _fingerprint(lines):
    """Return the hexadecimal MD5 digest of the concatenated *lines*."""
    return hashlib.md5(b''.join(lines)).hexdigest()


def _as_text(raw):
    """Return *raw* as a native ``str`` on both Python 2 and Python 3."""
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8', 'replace')


# Maps fingerprint -> paths of all messages sharing it, in walk order.
md5 = {}
# Initialised before the walk so the summary below still works when rootdir
# is missing or empty; counts every file seen across all sub-folders.
ii = 0

for subdir, dirs, files in os.walk(rootdir):
    print(colored("FOUND: %d e-mails" % len(files), "cyan"))
    for name in files:
        ii = ii + 1
        # Join with the directory currently being walked, not rootdir, so
        # messages in sub-folders resolve to their real location.
        path = os.path.join(subdir, name)
        headers = _header_lines(path)
        if not headers:
            # Without any identifying header every such file would share the
            # MD5 of empty input and be deleted as a "duplicate" of the
            # others.  Leave these files alone.
            continue
        md5.setdefault(_fingerprint(headers), []).append(path)

print("Duplicate hash find: %d files" % ii)

delcount = 0
for k, v in md5.items():
    if len(v) > 1:
        print("%s Count: %d Deleting: %d duplicates" % (k, len(v), len(v) - 1))
        print(v)
        # Pop from the end so v[0], the first copy found, is the survivor.
        while len(v) > 1:
            tbdeleted = v.pop()
            print("DELETE: %s" % tbdeleted)
            os.remove(tbdeleted)
            delcount = delcount + 1
        # Show the headers of the copy that was kept.
        kept = _as_text(b''.join(_header_lines(v[0])))
        print(colored('\t%s' % kept.replace('\n', '\n\t'), 'yellow'))
        print('-' * 64)

print(colored("TOTAL: Deleted %d duplicates" % delcount, "red"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment