Skip to content

Instantly share code, notes, and snippets.

@hervenivon
Created June 25, 2021 12:35
Show Gist options
  • Save hervenivon/e2dcd9d73af34c0f4e136f715b31dd5a to your computer and use it in GitHub Desktop.
Save hervenivon/e2dcd9d73af34c0f4e136f715b31dd5a to your computer and use it in GitHub Desktop.
Outlook Mac `olm` archive contact extractor
#!/usr/bin/env python
import csv
import re
import sys
import xml.etree.ElementTree as ET
from collections import Counter
from zipfile import ZipFile
# Matches the per-message XML members of the archive (e.g. ".../message_00001.xml").
# Raw string so that "\." is a regex escape rather than an invalid string escape.
messageRe = re.compile(r'.*message_[0-9]{5}\.xml')
# When True, entries are emitted as "Display Name <address>" instead of the
# bare address (only for addresses that carry a display name).
name = False
# When True, the run is aborted after 100 successfully parsed messages.
debug = False
# Substrings matched against the lower-cased address; any hit drops the address.
blacklist = [
    'reply',
    'noreply',
    'no-reply',
    'registration',
    'chime',
    'support@',
    '-beta',
    '.calendar',
    'carlsonwagonlit.com',
    'mailbox',
    'account',
    'marketing@',
    'help@',
    'sales',
]


def _is_blacklisted(address):
    """Return True if the lower-cased *address* contains any blacklist fragment."""
    return any(fragment in address for fragment in blacklist)


def _addresses_in(root):
    """Yield one entry per non-blacklisted ``emailAddress`` element under *root*.

    Elements without an ``OPFContactEmailAddressAddress`` attribute are
    ignored. Addresses are lower-cased. When the module-level ``name`` flag is
    set and the element has an ``OPFContactEmailAddressName`` attribute, the
    entry is formatted as ``"Name <address>"``.
    """
    for item in root.iter('emailAddress'):
        address = item.attrib.get('OPFContactEmailAddressAddress')
        if address is None:
            continue
        address = address.lower()
        if _is_blacklisted(address):
            continue
        display_name = item.attrib.get('OPFContactEmailAddressName')
        if name and display_name is not None:
            yield display_name + ' <' + address + '>'
        else:
            yield address


def main(archive_path='OutlookArchive.olm', csv_path='extract.csv',
         txt_path='extract.txt'):
    """Extract contact addresses from an ``.olm`` archive and write two reports.

    Every archive member whose path matches ``messageRe`` is parsed as XML and
    scanned for ``emailAddress`` elements. Members that cannot be parsed are
    reported on stdout and skipped (they are not counted as parsed) instead of
    aborting the whole run.

    Args:
        archive_path: Path of the zip-format ``.olm`` archive to read.
        csv_path: Output file receiving ``entry,count`` rows, most frequent
            entry first.
        txt_path: Output file receiving each distinct entry once, one per
            line, in the same order as the CSV.

    Raises:
        SystemExit: In debug mode, after 100 messages have been parsed
            (no output files are written in that case).
    """
    full_email_list = []
    parsed_emails = 0

    with ZipFile(archive_path, 'r') as archive:
        for member in archive.namelist():
            # Keep only email related files.
            if not messageRe.match(member):
                continue
            message = archive.read(member)
            try:
                root = ET.fromstring(message)
            except Exception as err:
                # Best effort: one malformed message (bad XML, unsupported
                # encoding, ...) must not stop the extraction. Unlike a bare
                # ``except``, this lets Ctrl-C and SystemExit through.
                print('"{0}" couldn\'t parsed'.format(member))
                print('Unexpected error:', err)
                continue
            full_email_list.extend(_addresses_in(root))
            parsed_emails += 1
            if debug and parsed_emails == 100:
                sys.exit()

    print('Parsed emails: {0}'.format(parsed_emails))

    # most_common() orders by descending count and keeps first-seen order
    # among ties, so the resulting dict is deterministic.
    counts = dict(Counter(full_email_list).most_common())
    print(counts)

    # csv.writer quotes entries containing commas or quotes (possible when
    # display names are included); plain addresses are written unchanged.
    with open(csv_path, 'w', newline='', encoding='utf8') as f:
        csv.writer(f, lineterminator='\n').writerows(counts.items())

    # The dict keys are already unique; reusing them gives a stable order
    # instead of the arbitrary order of a set.
    with open(txt_path, 'w', newline='', encoding='utf8') as f:
        f.write('\n'.join(counts))

    print('extraction completed')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment