Created
June 25, 2021 12:35
-
-
Save hervenivon/e2dcd9d73af34c0f4e136f715b31dd5a to your computer and use it in GitHub Desktop.
Outlook Mac `olm` archive contact extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Extract contact e-mail addresses from an Outlook for Mac ``.olm`` archive.

An ``.olm`` archive is read as a zip file. Every member whose name matches
``messageRe`` is parsed as XML and the ``emailAddress`` elements it contains
are collected. Two files are written to the current directory:

* ``extract.csv`` -- one ``address,count`` row per address, most frequent first
* ``extract.txt`` -- the unique addresses, one per line, sorted
"""
import csv
import re
import sys
import xml.etree.ElementTree as ET
from collections import Counter
from zipfile import ZipFile

# Archive members that hold a single e-mail message.
messageRe = re.compile(r'.*message_[0-9]{5}\.xml')

# When True, entries are emitted as "Display Name <address>" if a name exists.
name = False
# When True, the script exits right after the 100th parsed message.
debug = False

# Substrings that mark an address as automated / not a real contact.
blacklist = ['reply',
             'noreply',
             'no-reply',
             'registration',
             'chime',
             'support@',
             '-beta',
             '.calendar',
             'carlsonwagonlit.com',
             'mailbox',
             'account',
             'marketing@',
             'help@',
             'sales']


def is_blacklisted(address):
    """Return True if *address* contains any ``blacklist`` substring.

    The comparison is done on the lower-cased address.
    """
    lowered = address.lower()
    return any(fragment in lowered for fragment in blacklist)


def extract_addresses(message, with_name=False):
    """Return the contact entries found in one message XML document.

    Args:
        message: raw XML (bytes or str) of a single message file.
        with_name: emit ``"Name <address>"`` when the element also carries
            a display name; otherwise emit the bare address.

    Returns:
        A list of entries (addresses are lower-cased, blacklisted ones are
        skipped), or ``None`` when *message* is not well-formed XML.
    """
    try:
        root = ET.fromstring(message)
    except ET.ParseError:
        return None

    entries = []
    for item in root.iter('emailAddress'):
        attrs = item.attrib
        if 'OPFContactEmailAddressAddress' not in attrs:
            continue
        address = attrs['OPFContactEmailAddressAddress'].lower()
        if is_blacklisted(address):
            continue
        if with_name and 'OPFContactEmailAddressName' in attrs:
            entries.append(
                attrs['OPFContactEmailAddressName'] + ' <' + address + '>')
        else:
            entries.append(address)
    return entries


def collect_addresses(archive, with_name=False, debug=False):
    """Collect contact entries from every message stored in *archive*.

    Args:
        archive: path to (or binary file object of) the ``.olm`` zip archive.
        with_name: forwarded to :func:`extract_addresses`.
        debug: when True, terminate the process via ``sys.exit()`` as soon
            as 100 messages have been parsed.

    Returns:
        ``(entries, parsed)`` -- every entry found, duplicates included, and
        the number of messages that were parsed successfully. Messages that
        are not well-formed XML are reported on stdout and skipped.
    """
    entries = []
    parsed = 0
    with ZipFile(archive, 'r') as zip_obj:
        for member in zip_obj.namelist():
            # Keep only e-mail related files.
            if not messageRe.match(member):
                continue
            found = extract_addresses(zip_obj.read(member), with_name)
            if found is None:
                # A broken message must not abort the whole extraction.
                print('"{0}" couldn\'t parsed'.format(member))
                continue
            entries.extend(found)
            parsed += 1
            if debug and parsed == 100:
                sys.exit()
    return entries, parsed


def write_reports(counts, csv_path='extract.csv', txt_path='extract.txt'):
    """Write the frequency table and the unique address list to disk.

    Args:
        counts: mapping ``entry -> occurrences``; rows are written in the
            mapping's iteration order.
        csv_path: destination of the ``entry,count`` table.
        txt_path: destination of the sorted, newline-separated unique list.
    """
    # newline='' keeps the "\n" terminators untouched on every platform.
    with open(csv_path, 'w', encoding='utf8', newline='') as f:
        # csv.writer quotes entries containing commas or quotes, which the
        # "Name <address>" form can easily produce.
        writer = csv.writer(f, lineterminator='\n')
        for entry, count in counts.items():
            writer.writerow([entry, count])

    with open(txt_path, 'w', encoding='utf8', newline='') as f:
        # Sorted so that repeated runs produce identical files.
        f.write('\n'.join(sorted(counts)))


def main():
    """Run the extraction on ``OutlookArchive.olm`` in the current directory."""
    entries, parsed = collect_addresses(
        'OutlookArchive.olm', with_name=name, debug=debug)
    print('Parsed emails: {0}'.format(parsed))

    # most_common() orders by descending count, ties in first-seen order.
    counts = dict(Counter(entries).most_common())
    print(counts)

    write_reports(counts)
    print('extraction completed')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment