Skip to content

Instantly share code, notes, and snippets.

@akaihola
Created September 13, 2010 12:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save akaihola/577229 to your computer and use it in GitHub Desktop.
Save akaihola/577229 to your computer and use it in GitHub Desktop.
This is a script I used to clean up the havoc Thunderbird caused when it copied thousands of messages from INBOX into a folder five times over.
"""Helpers for removing duplicate messages in an IMAP mailbox
Dependency: IMAPClient (easy_install IMAPClient)
Author: Antti Kaihola <akaihol+python@ambitone.com>
License: New BSD license
"""
import collections
import imapclient
class IMAPDeDupper(object):
    """Find duplicate messages in the currently selected IMAP folder.

    Wraps a client object exposing ``search()`` and ``fetch()`` (an
    ``imapclient.IMAPClient`` in ``example``).  The caller is responsible
    for logging in and selecting a folder first.  This class only reads
    messages; it never deletes or modifies anything itself.
    """

    def __init__(self, client):
        """Remember ``client``; every query is issued through it."""
        self.client = client

    def get_all_messages(self):
        """Fetch INTERNALDATE and ENVELOPE for every message in the folder.

        Return value: whatever ``client.fetch`` returns (used by the
        other methods as a mapping of UID to message data), or an empty
        dict when ``client.search()`` finds no messages.
        """
        uids = self.client.search()
        if not uids:
            # Nothing to fetch; avoid sending a FETCH with an empty UID set.
            return {}
        return self.client.fetch(uids, ('internaldate', 'envelope',))

    @staticmethod
    def _get_field(msg, name):
        """Look up ``name`` in ``msg``, accepting text or bytes keys.

        Recent IMAPClient releases on Python 3 key fetched data by bytes
        (``b'ENVELOPE'``) while older ones used ``'ENVELOPE'``.  On
        Python 2 both spellings are the same object type, so the
        fallback is a no-op there.
        """
        if name in msg:
            return msg[name]
        return msg[name.encode('ascii')]

    def make_fingerprint(self, msg):
        """Return a hashable key identifying the content of ``msg``.

        The key is a 2-tuple of the message's INTERNALDATE and the first
        nine items of its ENVELOPE.
        """
        # NOTE(review): slicing to nine items presumably drops the
        # trailing Message-ID of an RFC 3501 envelope -- confirm.
        return (self._get_field(msg, 'INTERNALDATE'),
                self._get_field(msg, 'ENVELOPE')[:9])

    def group_messages_by_fingerprint(self):
        """Returns a dict grouping duplicate messages in current folder

        Message fingerprints as returned by ``make_fingerprint`` are
        used as dict keys, and lists of message UIDs as dict values.
        UIDs inside each list are in ascending order.
        """
        msgs = self.get_all_messages()
        grouped = collections.defaultdict(list)
        # Walk UIDs in sorted order so the grouping (and therefore which
        # copy ``get_duplicates`` keeps) does not depend on dict ordering.
        for uid in sorted(msgs):
            grouped[self.make_fingerprint(msgs[uid])].append(uid)
        return grouped

    def get_duplicates(self):
        """Returns UIDs of duplicate and unique messages in the current folder

        Return value: a 2-tuple of lists ``([<duplicates>], [<uniques>])``

        For sets of duplicate messages, the first (lowest UID) message is
        not included in either list, and the following messages are
        added to the ``duplicates`` list.  Deleting ``duplicates``
        therefore leaves exactly one copy of each message.
        """
        grouped = self.group_messages_by_fingerprint()
        uniques = []
        duplicates = []
        for uids in grouped.values():
            if len(uids) > 1:
                duplicates.extend(uids[1:])
            else:
                uniques.append(uids[0])
        return duplicates, uniques

    def get_matching(self, fingerprints):
        """Finds messages in the current folder matching given fingerprints

        Given a list (or dict with keys) of message fingerprints,
        searches the current folder for messages matching any of the
        fingerprints.  No messages are deleted here; the caller decides
        what to do with the returned UIDs.

        Return value: a 2-tuple of UID lists
        ``([<matching>], [<nonmatching>])``
        """
        grouped = self.group_messages_by_fingerprint()
        matching = []
        nonmatching = []
        for fingerprint, uids in grouped.items():
            if fingerprint in fingerprints:
                matching.extend(uids)
            else:
                nonmatching.extend(uids)
        return matching, nonmatching
def example():
    """Demonstrate de-duplicating a folder and then the inbox against it.

    Connects to a placeholder server with placeholder credentials, so it
    is meant to be read and adapted rather than run as-is.  WARNING: the
    steps below permanently delete messages (``delete_messages``
    followed by ``expunge``).
    """
    c = imapclient.IMAPClient('imap.mydomain.com')
    c.login('login', 'password')
    try:
        d = IMAPDeDupper(c)

        # delete duplicate messages in folder ``2009``
        c.select_folder('2009')
        dups, _uniques = d.get_duplicates()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('In folder "2009", deleting message UIDS:\n%s' % repr(dups))
        c.delete_messages(dups)
        c.expunge()

        # delete inbox messages which exist in folder ``2009``
        c.select_folder('2009')
        fingerprints_2009 = d.group_messages_by_fingerprint()
        c.select_folder('INBOX')
        dups, _uniques = d.get_matching(fingerprints_2009)
        print('In the inbox, deleting message UIDS:\n%s' % repr(dups))
        c.delete_messages(dups)
        c.expunge()
    finally:
        # Always close the session, even if one of the steps above fails.
        c.logout()
if __name__ == '__main__':
    # Running this module as a script is currently a no-op; nothing is
    # called from here.
    pass
    # TODO: implement command line interface
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment