Skip to content

Instantly share code, notes, and snippets.

@lewisthompson
Created January 19, 2019 19:59
Show Gist options
  • Save lewisthompson/a982729a4a3c2ffa7c9e7004e8f5b41e to your computer and use it in GitHub Desktop.
Save lewisthompson/a982729a4a3c2ffa7c9e7004e8f5b41e to your computer and use it in GitHub Desktop.
Part of dedupe_maildir.sh
#!/usr/bin/env python
import sys
def classify_dupes(xtuids_filename, dupes1_filename, dupes_filename,
                   origs_filename):
    """Split grouped entries into a "dupes" file and an "origs" file.

    ``xtuids_filename`` is read as one entry per line (surrounding
    whitespace stripped).  ``dupes1_filename`` is read as one group per
    line, each group being whitespace-separated entries.  Every entry of
    every group is written, one per line, to ``dupes_filename`` when it
    also appears in the xtuids list, and to ``origs_filename`` otherwise.

    Both output files are truncated/created.  Blank lines in the groups
    file are skipped rather than counted as groups.

    Returns:
        A ``(dupe_count, orig_count, dupes_without_origs)`` tuple, where
        ``dupes_without_origs`` is the number of groups in which every
        entry was classified as a dupe (i.e. no entry went to the origs
        file).

    Raises:
        IOError/OSError: if any of the files cannot be opened.
    """
    # Everything is handled as bytes: the inputs were already read in
    # binary mode, and writing bytes back out keeps the entries intact
    # regardless of their encoding and works on Python 2 and 3 alike.
    with open(xtuids_filename, 'rb') as f:
        tuids = set(line.strip() for line in f)

    dupe_count = 0
    orig_count = 0
    dupes_without_origs = 0
    with open(dupes_filename, 'wb') as df, \
            open(origs_filename, 'wb') as of, \
            open(dupes1_filename, 'rb') as f:
        for line in f:
            group = line.split()
            if not group:
                # An empty line holds no dupes at all, so it must not be
                # reported as a "dupe without original".
                continue
            found_orig = False
            for entry in group:
                if entry in tuids:
                    df.write(entry + b'\n')
                    dupe_count += 1
                else:
                    found_orig = True
                    of.write(entry + b'\n')
                    orig_count += 1
            if not found_orig:
                dupes_without_origs += 1
    return dupe_count, orig_count, dupes_without_origs


def main(argv=None):
    """Command-line entry point.

    Expects four file names after the program name: xtuids, dupes1
    (input groups), dupes (output) and origs (output).  Extra arguments
    are ignored.  Prints a one-line summary and returns the process exit
    status (0 on success, 2 on a usage error).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 5:
        sys.stderr.write(
            'usage: {0} XTUIDS_FILE DUPES1_FILE DUPES_OUT ORIGS_OUT\n'.format(
                argv[0] if argv else 'script'))
        return 2
    counts = classify_dupes(argv[1], argv[2], argv[3], argv[4])
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('found {0} dupes, {1} origs & {2} dupes without originals'.format(*counts))
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment