Last active
November 15, 2016 14:28
-
-
Save moshekaplan/555274a4dd06cdcaaf5fb577b8d51571 to your computer and use it in GitHub Desktop.
Extract features from MSG files. Based on https://github.com/mattgwwalker/msg-extractor/blob/master/ExtractMsg.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: latin-1 -*- | |
""" | |
ExtractMsg: | |
Extracts emails and attachments saved in Microsoft Outlook's .msg files | |
https://github.com/mattgwwalker/msg-extractor | |
""" | |
__author__ = "Matthew Walker" | |
__date__ = "2016-10-09" | |
__version__ = '0.3' | |
# --- LICENSE ----------------------------------------------------------------- | |
# | |
# Copyright 2013 Matthew Walker | |
# | |
# This program is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
import os | |
import sys | |
import glob | |
import traceback | |
from email.parser import Parser as EmailParser | |
import email.utils | |
import olefile as OleFile | |
# This property information was sourced from
# http://www.fileformat.info/format/outlookmsg/index.htm
# on 2013-07-22.
#
# Maps the four-hex-digit property code that appears in a stream name (for
# example the '0037' in '__substg1.0_0037001F') to a human-readable label.
# Message.saveRaw() slices that code out of each stream name and looks it up
# here, so every key must use the same upper-case hex spelling as the stream
# names used elsewhere in this file (e.g. '__substg1.0_007D').
properties = {
    '001A': 'Message class',
    '0037': 'Subject',
    '003D': 'Subject prefix',
    '0040': 'Received by name',
    '0042': 'Sent repr name',
    '0044': 'Rcvd repr name',
    '004D': 'Org author name',
    '0050': 'Reply rcipnt names',
    '005A': 'Org sender name',
    '0064': 'Sent repr adrtype',
    '0065': 'Sent repr email',
    '0070': 'Topic',
    '0075': 'Rcvd by adrtype',
    '0076': 'Rcvd by email',
    '0077': 'Repr adrtype',
    '0078': 'Repr email',
    # Was spelled '007d'; the lower-case key could never match a lookup.
    '007D': 'Message header',
    '0C1A': 'Sender name',
    '0C1E': 'Sender adr type',
    '0C1F': 'Sender email',
    '0E02': 'Display BCC',
    '0E03': 'Display CC',
    '0E04': 'Display To',
    '0E1D': 'Subject (normalized)',
    '0E28': 'Recvd account1 (uncertain)',
    '0E29': 'Recvd account2 (uncertain)',
    '1000': 'Message body',
    '1008': 'RTF sync body tag',
    '1035': 'Message ID (uncertain)',
    '1046': 'Sender email (uncertain)',
    '3001': 'Display name',
    '3002': 'Address type',
    '3003': 'Email address',
    '39FE': '7-bit email (uncertain)',
    '39FF': '7-bit display name',

    # Attachments (37xx)
    '3701': 'Attachment data',
    '3703': 'Attachment extension',
    '3704': 'Attachment short filename',
    '3707': 'Attachment long filename',
    '370E': 'Attachment mime tag',
    '3712': 'Attachment ID (uncertain)',

    # Address book (3Axx):
    '3A00': 'Account',
    '3A02': 'Callback phone no',
    '3A05': 'Generation',
    '3A06': 'Given name',
    '3A08': 'Business phone',
    '3A09': 'Home phone',
    '3A0A': 'Initials',
    '3A0B': 'Keyword',
    '3A0C': 'Language',
    '3A0D': 'Location',
    '3A11': 'Surname',
    '3A15': 'Postal address',
    '3A16': 'Company name',
    '3A17': 'Title',
    '3A18': 'Department',
    '3A19': 'Office location',
    '3A1A': 'Primary phone',
    '3A1B': 'Business phone 2',
    '3A1C': 'Mobile phone',
    '3A1D': 'Radio phone no',
    '3A1E': 'Car phone no',
    '3A1F': 'Other phone',
    '3A20': 'Transmit dispname',
    '3A21': 'Pager',
    '3A22': 'User certificate',
    '3A23': 'Primary Fax',
    '3A24': 'Business Fax',
    '3A25': 'Home Fax',
    '3A26': 'Country',
    '3A27': 'Locality',
    '3A28': 'State/Province',
    '3A29': 'Street address',
    '3A2A': 'Postal Code',
    '3A2B': 'Post Office Box',
    '3A2C': 'Telex',
    '3A2D': 'ISDN',
    '3A2E': 'Assistant phone',
    '3A2F': 'Home phone 2',
    '3A30': 'Assistant',
    '3A44': 'Middle name',
    '3A45': 'Dispname prefix',
    '3A46': 'Profession',
    '3A48': 'Spouse name',
    '3A4B': 'TTYTTD radio phone',
    '3A4C': 'FTP site',
    '3A4E': 'Manager name',
    '3A4F': 'Nickname',
    '3A51': 'Business homepage',
    '3A57': 'Company main phone',
    '3A58': 'Childrens names',
    '3A59': 'Home City',
    '3A5A': 'Home Country',
    '3A5B': 'Home Postal Code',
    '3A5C': 'Home State/Provnce',
    '3A5D': 'Home Street',
    '3A5F': 'Other adr City',
    '3A60': 'Other adr Country',
    '3A61': 'Other adr PostCode',
    '3A62': 'Other adr Province',
    '3A63': 'Other adr Street',
    '3A64': 'Other adr PO box',

    '3FF7': 'Server (uncertain)',
    '3FF8': 'Creator1 (uncertain)',
    '3FFA': 'Creator2 (uncertain)',
    '3FFC': 'To email (uncertain)',
    '403D': 'To adrtype (uncertain)',
    '403E': 'To email (uncertain)',
    '5FF6': 'To (uncertain)',
}
def windowsUnicode(string):
    """Decode a UTF-16 little-endian byte string into text.

    ``None`` is handed back unchanged, so the result of a stream lookup
    that found nothing can be passed straight through.
    """
    if string is not None:
        # Choose the text constructor of the running interpreter; the
        # ``unicode`` name is only evaluated on Python 2, where it exists.
        textType = str if sys.version_info[0] >= 3 else unicode
        return textType(string, 'utf_16_le')
    return None
class Attachment:
    """One attachment read from an attachment folder of a message.

    Attributes:
        longFilename: result of the '3707' string-stream lookup.
        shortFilename: result of the '3704' string-stream lookup.
        data: result of reading the '37010102' stream (the payload).

    Each attribute holds whatever the message object returned for that
    lookup, which may be ``None`` when the stream is missing.
    """

    def __init__(self, msg, dir_):
        """Read the names and payload stored below folder *dir_* of *msg*.

        *msg* must provide ``_getStringStream`` and ``_getStream``; both are
        called with a ``[dir_, stream_name]`` list.
        """
        # Get long filename
        self.longFilename = msg._getStringStream([dir_, '__substg1.0_3707'])

        # Get short filename
        self.shortFilename = msg._getStringStream([dir_, '__substg1.0_3704'])

        # Get attachment data
        self.data = msg._getStream([dir_, '__substg1.0_37010102'])

    @staticmethod
    def _safeName(name):
        """Return *name* reduced to a bare file name, or None if unusable."""
        if name is None:
            return None
        if isinstance(name, bytes) and not isinstance(name, str):
            # Python 3 only: a name that was not decoded arrives as bytes.
            name = name.decode('latin-1')
        # The name is taken from the message itself, so drop any directory
        # part (either separator style) to keep the file in the current
        # working directory.
        name = name.replace('\\', '/').split('/')[-1]
        if name in ('', '.', '..'):
            return None
        return name

    def save(self):
        """Write the attachment into the current directory.

        The long filename is preferred, then the short one; if neither is
        usable a random ``UnknownFilename XXXXX.bin`` name is generated.
        Missing attachment data results in an empty file.

        Returns:
            The file name that was written.
        """
        filename = None
        for candidate in (self.longFilename, self.shortFilename):
            filename = self._safeName(candidate)
            if filename is not None:
                break

        # Otherwise just make something up!
        if filename is None:
            import random
            import string
            alphabet = string.ascii_uppercase + string.digits
            suffix = ''.join(random.choice(alphabet) for _ in range(5))
            filename = 'UnknownFilename ' + suffix + ".bin"

        # The context manager closes the handle even if the write fails.
        with open(filename, 'wb') as f:
            f.write(self.data if self.data is not None else b'')
        return filename
class Message(OleFile.OleFileIO):
    """An Outlook ``.msg`` file opened as an OLE compound document.

    Common fields (subject, header, sender, to, cc, body, attachments) are
    exposed as properties; ``save`` writes the body and the attachments to
    a new folder and ``saveRaw`` dumps every stream for inspection.
    """

    def __init__(self, filename):
        """Open *filename* and remember it for ``save(useFileName=True)``."""
        OleFile.OleFileIO.__init__(self, filename)
        self._sourceFilename = filename

    def _getStream(self, filename):
        """Return the full contents of stream *filename*, or None if absent."""
        if not self.exists(filename):
            return None
        return self.openstream(filename).read()

    def _getStringStream(self, filename, prefer='unicode'):
        """Gets a string representation of the requested filename.

        Checks for both ASCII ('001E') and Unicode ('001F')
        representations and returns a value if possible.  If there are
        both ASCII and Unicode versions, then the parameter /prefer/
        specifies which will be returned ('unicode' selects the Unicode
        one, anything else the ASCII one).

        The Unicode version is decoded with ``windowsUnicode``; the ASCII
        version is returned exactly as read from the stream.  *filename*
        may be a list of path parts, which are joined with '/'.
        """
        if isinstance(filename, list):
            # Join with slashes to make it easier to append the type
            filename = "/".join(filename)

        asciiVersion = self._getStream(filename + '001E')
        unicodeVersion = windowsUnicode(self._getStream(filename + '001F'))
        if asciiVersion is None:
            return unicodeVersion
        if unicodeVersion is None:
            return asciiVersion
        return unicodeVersion if prefer == 'unicode' else asciiVersion

    def _headerValue(self, name):
        """Return header field *name*, or None without a header or field."""
        header = self.header
        if header is None:
            return None
        return header[name]

    @property
    def subject(self):
        """The subject string (stream 0037), or None."""
        return self._getStringStream('__substg1.0_0037')

    @property
    def header(self):
        """The parsed message header (stream 007D), or None; cached."""
        # hasattr rather than a broad except: only "not computed yet" should
        # trigger the computation, real errors must surface.
        if not hasattr(self, '_header'):
            headerText = self._getStringStream('__substg1.0_007D')
            if headerText is not None:
                self._header = EmailParser().parsestr(headerText)
            else:
                self._header = None
        return self._header

    @property
    def date(self):
        """The 'date' field of the header, or None if there is no header."""
        return self._headerValue('date')

    @property
    def parsedDate(self):
        """``email.utils.parsedate`` applied to the date, or None if absent."""
        date = self.date
        if date is None:
            # parsedate is not given None; there is simply no date to parse.
            return None
        return email.utils.parsedate(date)

    @property
    def sender(self):
        """The sender: header 'from', else built from name/e-mail streams."""
        if not hasattr(self, '_sender'):
            # Check header first
            result = self._headerValue('from')
            if result is None:
                # Extract from other fields
                name = self._getStringStream('__substg1.0_0C1A')
                address = self._getStringStream('__substg1.0_0C1F')
                if name is None:
                    result = address
                elif address is not None:
                    result = name + " <" + address + ">"
                else:
                    result = name
            self._sender = result
        return self._sender

    @property
    def to(self):
        """The recipients: header 'to', else the 'Display To' stream."""
        if not hasattr(self, '_to'):
            # Check header first
            value = self._headerValue('to')
            if value is None:
                # Extract from other fields
                # TODO: This should really extract data from the recip
                # folders, but how do you know which is to/cc/bcc?
                value = self._getStringStream('__substg1.0_0E04')
            self._to = value
        return self._to

    @property
    def cc(self):
        """The CC recipients: header 'cc', else the 'Display CC' stream."""
        if not hasattr(self, '_cc'):
            # Check header first
            value = self._headerValue('cc')
            if value is None:
                # Extract from other fields
                # TODO: This should really extract data from the recip
                # folders, but how do you know which is to/cc/bcc?
                value = self._getStringStream('__substg1.0_0E03')
            self._cc = value
        return self._cc

    @property
    def body(self):
        """The message body (stream 1000), or None."""
        return self._getStringStream('__substg1.0_1000')

    @property
    def attachments(self):
        """List of ``Attachment`` objects, one per '__attach*' folder; cached."""
        if not hasattr(self, '_attachments'):
            # Collect each top-level attachment folder once, in listing order.
            attachmentDirs = []
            for dir_ in self.listdir():
                top = dir_[0]
                if top.startswith('__attach') and top not in attachmentDirs:
                    attachmentDirs.append(top)
            self._attachments = [Attachment(self, attachmentDir)
                                 for attachmentDir in attachmentDirs]
        return self._attachments

    def save(self, toJson=False, useFileName=False, raw=False):
        """Save the message body and attachments into a new folder.

        Args:
            toJson: write ``message.json`` (JSON-formatted) instead of
                ``message.text``.
            useFileName: name the folder after the file this message was
                opened from (extension removed); otherwise the message's
                date and subject are used.
            raw: accepted for compatibility; not used by this method.

        Raises:
            ValueError: *useFileName* was requested but the message was not
                opened from a path.
            Exception: the folder could not be created, even with a
                numeric ' (n)' suffix.

        If writing fails, ``saveRaw`` is called inside the new folder and
        the error is re-raised.  The working directory is always restored.
        """
        if useFileName:
            source = getattr(self, '_sourceFilename', None)
            if source is None or hasattr(source, 'read'):
                raise ValueError(
                    "useFileName requires a message opened from a file path")
            # strip out the directory and the extension
            dirName = os.path.splitext(os.path.basename(source))[0]
        else:
            # Create a directory based on the date and subject of the message
            d = self.parsedDate
            if d is not None:
                dirName = '{0:02d}-{1:02d}-{2:02d}_{3:02d}{4:02d}'.format(*d)
            else:
                dirName = "UnknownDate"

            if self.subject is None:
                subject = "[No subject]"
            else:
                # Drop characters that are not allowed in Windows file names.
                subject = "".join(i for i in self.subject if i not in r'\/:*?"<>|')

            dirName = dirName + " " + subject

        def addNumToDir(dirName):
            # Attempt to create the directory with a '(n)' appended
            for i in range(2, 100):
                newDirName = dirName + " (" + str(i) + ")"
                try:
                    os.makedirs(newDirName)
                except OSError:
                    continue
                return newDirName
            return None

        try:
            os.makedirs(dirName)
        except OSError:
            newDirName = addNumToDir(dirName)
            if newDirName is None:
                raise Exception(
                    "Failed to create directory '%s'. Does it already exist?" %
                    dirName
                )
            dirName = newDirName

        def xstr(s):
            return '' if s is None else str(s)

        oldDir = os.getcwd()
        try:
            os.chdir(dirName)

            # Save the message body
            fext = 'json' if toJson else 'text'
            with open("message." + fext, "w") as f:
                # Save the attachments
                attachmentNames = [attachment.save()
                                   for attachment in self.attachments]

                # From, to , cc, subject, date
                if toJson:
                    import json
                    from imapclient.imapclient import decode_utf7

                    emailObj = {'from': xstr(self.sender),
                                'to': xstr(self.to),
                                'cc': xstr(self.cc),
                                'subject': xstr(self.subject),
                                'date': xstr(self.date),
                                'attachments': attachmentNames,
                                'body': decode_utf7(self.body)}
                    f.write(json.dumps(emailObj, ensure_ascii=True))
                else:
                    f.write("From: " + xstr(self.sender) + "\n")
                    f.write("To: " + xstr(self.to) + "\n")
                    f.write("CC: " + xstr(self.cc) + "\n")
                    f.write("Subject: " + xstr(self.subject) + "\n")
                    f.write("Date: " + xstr(self.date) + "\n")
                    f.write("-----------------\n\n")
                    # A message without a body must not abort the save.
                    f.write(xstr(self.body))
        except Exception:
            # Leave a raw dump behind for diagnosis, then report the error.
            self.saveRaw()
            raise
        finally:
            # Return to previous directory
            os.chdir(oldDir)

    def saveRaw(self):
        """Dump every stream into a new 'raw' folder in the current directory.

        Each stream gets its own sub-folder named after its path, with the
        label from ``properties`` appended when its code is known, holding
        a single ``contents`` file (``contents.txt`` for '001E' streams).
        The working directory is restored afterwards.
        """
        oldDir = os.getcwd()
        try:
            # Create a 'raw' folder
            rawDir = "raw"
            os.makedirs(rawDir)
            os.chdir(rawDir)

            # Loop through all the directories
            for dir_ in self.listdir():
                sysdir = "/".join(dir_)
                # The property code is the four characters preceding the
                # four-character type suffix of the stream name.
                code = dir_[-1][-8:-4]
                if code in properties:
                    sysdir = sysdir + " - " + properties[code]
                os.makedirs(sysdir)

                # Generate appropriate filename
                if dir_[-1].endswith("001E"):
                    filename = "contents.txt"
                else:
                    filename = "contents"

                # Save contents of directory
                with open(os.path.join(sysdir, filename), 'wb') as f:
                    f.write(self._getStream(dir_))
        finally:
            os.chdir(oldDir)

    def dump(self):
        """Print a summary (subject, date, body) of the message."""
        # Single-argument print calls behave the same on Python 2 and 3.
        print('Message')
        print('Subject: %s' % (self.subject,))
        print('Date: %s' % (self.date,))
        print('Body:')
        print(self.body)

    def debug(self):
        """Print the path and contents of every '001E' (ASCII) stream."""
        for dir_ in self.listdir():
            if dir_[-1].endswith('001E'):  # FIXME: Check for unicode 001F too
                print("Directory: " + str(dir_))
                print("Contents: " + str(self._getStream(dir_)))

    def save_attachments(self, raw=False):
        """Saves only attachments in the same folder.

        *raw* is accepted for compatibility and not used.
        """
        for attachment in self.attachments:
            attachment.save()
import hashlib | |
def _findUrls(text):
    """Return the list of http(s) URLs found in *text* (empty if none)."""
    import re
    if not text:
        return []
    if isinstance(text, bytes) and not isinstance(text, str):
        # Python 3 only: a body that was not decoded arrives as bytes.
        text = text.decode('utf-8', 'replace')
    # Raw string: the backslashes are for the regex engine, not for Python.
    pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-;=@-_-&/\?$_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.findall(pattern, text)


def main():
    """Print sender, subject, date, attachment hashes and links of a .msg file.

    The path is read from ``sys.argv[1]``.  Without it a usage line is
    printed and the process exits with status 1.
    """
    import hashlib

    if len(sys.argv) < 2:
        print("USAGE: %s %s <filename>" % (sys.executable, sys.argv[0]))
        sys.exit(1)

    fpath = sys.argv[1]
    msg = Message(fpath)
    try:
        # Single-argument print calls behave the same on Python 2 and 3.
        print("--Email Info--")
        print("From: %s" % (msg.sender,))
        print("Subject: %s" % (msg.subject,))
        print("Date: %s" % (msg.date,))

        attachments = msg.attachments
        if attachments:
            print("--Files--")
            for attachment in attachments:
                filename = attachment.longFilename
                # Otherwise use the short filename
                if filename is None:
                    filename = attachment.shortFilename
                print("Filename: %s" % (filename,))
                # An attachment without a data stream has nothing to hash.
                digest = None
                if attachment.data is not None:
                    digest = hashlib.md5(attachment.data).hexdigest()
                print("MD5: %s" % (digest,))

        urls = _findUrls(msg.body)
        if urls:
            print("--Links--")
            print("\n".join(urls))
    finally:
        # Release the underlying OLE file.
        msg.close()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment