Skip to content

Instantly share code, notes, and snippets.

@moshekaplan
Last active November 15, 2016 14:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save moshekaplan/555274a4dd06cdcaaf5fb577b8d51571 to your computer and use it in GitHub Desktop.
Save moshekaplan/555274a4dd06cdcaaf5fb577b8d51571 to your computer and use it in GitHub Desktop.
Extract features from MSG files. Based on https://github.com/mattgwwalker/msg-extractor/blob/master/ExtractMsg.py Raw
#!/usr/bin/env python
# -*- coding: latin-1 -*-
"""
ExtractMsg:
Extracts emails and attachments saved in Microsoft Outlook's .msg files
https://github.com/mattgwwalker/msg-extractor
"""
__author__ = "Matthew Walker"
__date__ = "2016-10-09"
__version__ = '0.3'
# --- LICENSE -----------------------------------------------------------------
#
# Copyright 2013 Matthew Walker
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import glob
import traceback
from email.parser import Parser as EmailParser
import email.utils
import olefile as OleFile
# This property information was sourced from
# http://www.fileformat.info/format/outlookmsg/index.htm
# on 2013-07-22.
# Maps the four-hex-digit MAPI property ID embedded in a stream name
# (e.g. the '0037' in '__substg1.0_0037001F') to a human-readable label.
# Keys are upper-case hex because that is how the IDs are spelled in the
# stream names this module looks up (see '__substg1.0_007D' and friends);
# a lower-case key would never match the ID sliced out of a stream name.
properties = {
    '001A': 'Message class',
    '0037': 'Subject',
    '003D': 'Subject prefix',
    '0040': 'Received by name',
    '0042': 'Sent repr name',
    '0044': 'Rcvd repr name',
    '004D': 'Org author name',
    '0050': 'Reply rcipnt names',
    '005A': 'Org sender name',
    '0064': 'Sent repr adrtype',
    '0065': 'Sent repr email',
    '0070': 'Topic',
    '0075': 'Rcvd by adrtype',
    '0076': 'Rcvd by email',
    '0077': 'Repr adrtype',
    '0078': 'Repr email',
    '007D': 'Message header',
    '0C1A': 'Sender name',
    '0C1E': 'Sender adr type',
    '0C1F': 'Sender email',
    '0E02': 'Display BCC',
    '0E03': 'Display CC',
    '0E04': 'Display To',
    '0E1D': 'Subject (normalized)',
    '0E28': 'Recvd account1 (uncertain)',
    '0E29': 'Recvd account2 (uncertain)',
    '1000': 'Message body',
    '1008': 'RTF sync body tag',
    '1035': 'Message ID (uncertain)',
    '1046': 'Sender email (uncertain)',
    '3001': 'Display name',
    '3002': 'Address type',
    '3003': 'Email address',
    '39FE': '7-bit email (uncertain)',
    '39FF': '7-bit display name',
    # Attachments (37xx)
    '3701': 'Attachment data',
    '3703': 'Attachment extension',
    '3704': 'Attachment short filename',
    '3707': 'Attachment long filename',
    '370E': 'Attachment mime tag',
    '3712': 'Attachment ID (uncertain)',
    # Address book (3Axx):
    '3A00': 'Account',
    '3A02': 'Callback phone no',
    '3A05': 'Generation',
    '3A06': 'Given name',
    '3A08': 'Business phone',
    '3A09': 'Home phone',
    '3A0A': 'Initials',
    '3A0B': 'Keyword',
    '3A0C': 'Language',
    '3A0D': 'Location',
    '3A11': 'Surname',
    '3A15': 'Postal address',
    '3A16': 'Company name',
    '3A17': 'Title',
    '3A18': 'Department',
    '3A19': 'Office location',
    '3A1A': 'Primary phone',
    '3A1B': 'Business phone 2',
    '3A1C': 'Mobile phone',
    '3A1D': 'Radio phone no',
    '3A1E': 'Car phone no',
    '3A1F': 'Other phone',
    '3A20': 'Transmit dispname',
    '3A21': 'Pager',
    '3A22': 'User certificate',
    '3A23': 'Primary Fax',
    '3A24': 'Business Fax',
    '3A25': 'Home Fax',
    '3A26': 'Country',
    '3A27': 'Locality',
    '3A28': 'State/Province',
    '3A29': 'Street address',
    '3A2A': 'Postal Code',
    '3A2B': 'Post Office Box',
    '3A2C': 'Telex',
    '3A2D': 'ISDN',
    '3A2E': 'Assistant phone',
    '3A2F': 'Home phone 2',
    '3A30': 'Assistant',
    '3A44': 'Middle name',
    '3A45': 'Dispname prefix',
    '3A46': 'Profession',
    '3A48': 'Spouse name',
    '3A4B': 'TTYTTD radio phone',
    '3A4C': 'FTP site',
    '3A4E': 'Manager name',
    '3A4F': 'Nickname',
    '3A51': 'Business homepage',
    '3A57': 'Company main phone',
    '3A58': 'Childrens names',
    '3A59': 'Home City',
    '3A5A': 'Home Country',
    '3A5B': 'Home Postal Code',
    '3A5C': 'Home State/Provnce',
    '3A5D': 'Home Street',
    '3A5F': 'Other adr City',
    '3A60': 'Other adr Country',
    '3A61': 'Other adr PostCode',
    '3A62': 'Other adr Province',
    '3A63': 'Other adr Street',
    '3A64': 'Other adr PO box',
    '3FF7': 'Server (uncertain)',
    '3FF8': 'Creator1 (uncertain)',
    '3FFA': 'Creator2 (uncertain)',
    '3FFC': 'To email (uncertain)',
    '403D': 'To adrtype (uncertain)',
    '403E': 'To email (uncertain)',
    '5FF6': 'To (uncertain)',
}
def windowsUnicode(string):
    """Decode a UTF-16 little-endian byte buffer into a text string.

    ``None`` is passed straight through so that callers can feed in the
    result of a stream lookup that found nothing.
    """
    if string is None:
        return None
    # The text type is spelled `str` on Python 3 and `unicode` on Python 2;
    # `unicode` is only looked up when actually running on Python 2.
    text_type = str if sys.version_info[0] >= 3 else unicode
    return text_type(string, 'utf_16_le')
class Attachment:
    """One attachment read out of a message's attachment storage.

    Attributes:
        longFilename: string stream 3707 of the attachment, or None.
        shortFilename: string stream 3704 of the attachment, or None.
        data: raw contents of stream 37010102, or None when the message
            has no such stream for this attachment.
    """

    def __init__(self, msg, dir_):
        """Read the filenames and payload of the attachment stored in *dir_*.

        *msg* must provide ``_getStringStream`` and ``_getStream``; *dir_* is
        the name of the attachment's storage inside the message.
        """
        # Get long filename
        self.longFilename = msg._getStringStream([dir_, '__substg1.0_3707'])
        # Get short filename
        self.shortFilename = msg._getStringStream([dir_, '__substg1.0_3704'])
        # Get attachment data
        self.data = msg._getStream([dir_, '__substg1.0_37010102'])

    def save(self):
        """Write the attachment into the current directory.

        The long filename is preferred, then the short one; if neither is
        usable a random ``UnknownFilename XXXXX.bin`` name is generated.

        Returns:
            The filename that was actually written.
        """
        # Use long filename as first preference, otherwise the short one
        filename = self.longFilename
        if filename is None:
            filename = self.shortFilename

        if filename is not None:
            # The name comes from inside the message and is therefore
            # untrusted: keep only the final path component so that something
            # like '../../x' cannot escape the output directory.
            filename = os.path.basename(filename)

        # Otherwise (no name, or nothing left after stripping directories)
        # just make something up!
        if not filename:
            import random
            import string
            suffix = ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(5))
            filename = 'UnknownFilename ' + suffix + ".bin"

        # An attachment without a data stream is written as an empty file
        # instead of failing with a TypeError after the file was created.
        payload = self.data if self.data is not None else b''
        with open(filename, 'wb') as f:
            f.write(payload)
        return filename
class Message(OleFile.OleFileIO):
    """An Outlook .msg file opened as an OLE container.

    Message fields are read from streams named ``__substg1.0_<ID><TYPE>``,
    where the type suffix ``001E`` marks the ASCII variant and ``001F`` the
    Unicode variant of a string. Values derived from those streams (header,
    sender, to, cc, attachments) are computed on first access and cached on
    the instance.
    """

    def __init__(self, filename):
        """Open *filename* as an OLE file.

        The argument is remembered so that ``save(useFileName=True)`` can
        derive the output folder name from it.
        """
        self._msgFilename = filename
        OleFile.OleFileIO.__init__(self, filename)

    def _getStream(self, filename):
        """Return the full contents of stream *filename*, or None if absent."""
        if not self.exists(filename):
            return None
        return self.openstream(filename).read()

    def _getStringStream(self, filename, prefer='unicode'):
        """Gets a string representation of the requested filename.

        Checks for both ASCII and Unicode representations and returns
        a value if possible. If there are both ASCII and Unicode
        versions, then the parameter /prefer/ specifies which will be
        returned. Returns None when neither version exists.
        """
        if isinstance(filename, list):
            # Join with slashes to make it easier to append the type
            filename = "/".join(filename)
        asciiVersion = self._getStream(filename + '001E')
        unicodeVersion = windowsUnicode(self._getStream(filename + '001F'))
        if asciiVersion is None:
            return unicodeVersion
        if unicodeVersion is None:
            return asciiVersion
        return unicodeVersion if prefer == 'unicode' else asciiVersion

    def _cachedHeaderField(self, cacheAttr, headerName, fallbackStream):
        """Return header field *headerName*, else string stream *fallbackStream*.

        The result is stored on the instance under *cacheAttr* and reused on
        later calls.
        """
        try:
            return getattr(self, cacheAttr)
        except AttributeError:
            pass  # not computed yet
        value = None
        # Check header first
        if self.header is not None:
            value = self.header[headerName]
        if value is None:
            # TODO: This should really extract data from the recip folders,
            # but how do you know which is to/cc/bcc?
            value = self._getStringStream(fallbackStream)
        setattr(self, cacheAttr, value)
        return value

    @property
    def subject(self):
        """The subject string (stream 0037), or None."""
        return self._getStringStream('__substg1.0_0037')

    @property
    def header(self):
        """The parsed transport header (stream 007D), or None if absent."""
        try:
            return self._header
        except AttributeError:
            pass  # not computed yet
        headerText = self._getStringStream('__substg1.0_007D')
        if headerText is None:
            self._header = None
        else:
            self._header = EmailParser().parsestr(headerText)
        return self._header

    @property
    def date(self):
        """The 'date' field of the header, or None without a header."""
        # Get the message's header and extract the date
        header = self.header
        if header is None:
            return None
        return header['date']

    @property
    def parsedDate(self):
        """``email.utils.parsedate`` of the date, or None when there is none."""
        date = self.date
        if date is None:
            # Do not hand None to parsedate; there is nothing to parse.
            return None
        return email.utils.parsedate(date)

    @property
    def sender(self):
        """The header's 'from' field, else 'name <address>' from the streams."""
        try:
            return self._sender
        except AttributeError:
            pass  # not computed yet
        result = None
        # Check header first
        if self.header is not None:
            result = self.header["from"]
        if result is None:
            # Extract from other fields: sender name and sender email
            name = self._getStringStream('__substg1.0_0C1A')
            address = self._getStringStream('__substg1.0_0C1F')
            if name is None:
                result = address
            elif address is None:
                result = name
            else:
                result = name + " <" + address + ">"
        self._sender = result
        return result

    @property
    def to(self):
        """The header's 'to' field, else the 'Display To' stream (0E04)."""
        return self._cachedHeaderField('_to', 'to', '__substg1.0_0E04')

    @property
    def cc(self):
        """The header's 'cc' field, else the 'Display CC' stream (0E03)."""
        return self._cachedHeaderField('_cc', 'cc', '__substg1.0_0E03')

    @property
    def body(self):
        """The message body string (stream 1000), or None."""
        return self._getStringStream('__substg1.0_1000')

    @property
    def attachments(self):
        """List of Attachment objects, one per '__attach*' storage."""
        try:
            return self._attachments
        except AttributeError:
            pass  # not computed yet
        # Collect each attachment storage once, in listing order
        attachmentDirs = []
        for dir_ in self.listdir():
            if dir_[0].startswith('__attach') and dir_[0] not in attachmentDirs:
                attachmentDirs.append(dir_[0])
        # Only cache once every attachment was built, so a failure part-way
        # through cannot leave a truncated list behind.
        self._attachments = [Attachment(self, d) for d in attachmentDirs]
        return self._attachments

    @staticmethod
    def _makeNumberedDir(dirName):
        """Create '<dirName> (n)' for the first free n in 2..99.

        Returns the name created, or None if every candidate failed.
        """
        for i in range(2, 100):
            newDirName = dirName + " (" + str(i) + ")"
            try:
                os.makedirs(newDirName)
            except OSError:
                continue  # taken (or not creatable); try the next number
            return newDirName
        return None

    def save(self, toJson=False, useFileName=False, raw=False):
        '''Saves the message body and attachments found in the message.

        Setting toJson to true will output the message body as JSON-formatted
        text. The body and attachments are stored in a folder. Setting
        useFileName to true will mean that the filename is used as the name
        of the folder; otherwise, the message's date and subject are used as
        the folder name. The raw parameter is accepted but not used.

        If anything fails while writing, the raw streams are dumped with
        saveRaw() and the error is re-raised. The working directory is
        restored in all cases.

        Raises:
            Exception: if no output folder could be created.
        '''
        if useFileName:
            # strip out the directory part and the extension
            dirName = os.path.basename(self._msgFilename).split('.')[0]
        else:
            # Create a directory based on the date and subject of the message
            d = self.parsedDate
            if d is not None:
                dirName = '{0:02d}-{1:02d}-{2:02d}_{3:02d}{4:02d}'.format(*d)
            else:
                dirName = "UnknownDate"
            if self.subject is None:
                subject = "[No subject]"
            else:
                # Drop characters that are not allowed in Windows filenames
                subject = "".join(
                    i for i in self.subject if i not in r'\/:*?"<>|')
            dirName = dirName + " " + subject

        try:
            os.makedirs(dirName)
        except OSError:
            # Attempt to create the directory with a '(n)' appended
            newDirName = self._makeNumberedDir(dirName)
            if newDirName is None:
                raise Exception(
                    "Failed to create directory '%s'. Does it already exist?" %
                    dirName
                )
            dirName = newDirName

        def xstr(s):
            return '' if s is None else str(s)

        oldDir = os.getcwd()
        try:
            os.chdir(dirName)
            # Save the message body
            fext = 'json' if toJson else 'text'
            with open("message." + fext, "w") as f:
                # Save the attachments
                attachmentNames = [
                    attachment.save() for attachment in self.attachments]
                if toJson:
                    import json
                    from imapclient.imapclient import decode_utf7
                    emailObj = {'from': xstr(self.sender),
                                'to': xstr(self.to),
                                'cc': xstr(self.cc),
                                'subject': xstr(self.subject),
                                'date': xstr(self.date),
                                'attachments': attachmentNames,
                                'body': decode_utf7(self.body)}
                    f.write(json.dumps(emailObj, ensure_ascii=True))
                else:
                    # From, to, cc, subject, date
                    f.write("From: " + xstr(self.sender) + "\n")
                    f.write("To: " + xstr(self.to) + "\n")
                    f.write("CC: " + xstr(self.cc) + "\n")
                    f.write("Subject: " + xstr(self.subject) + "\n")
                    f.write("Date: " + xstr(self.date) + "\n")
                    f.write("-----------------\n\n")
                    body = self.body
                    if body is not None:
                        # A message without a body stream gets headers only
                        f.write(body)
        except Exception:
            self.saveRaw()
            raise
        finally:
            # Return to previous directory
            os.chdir(oldDir)

    def saveRaw(self):
        """Dump every stream into a new 'raw' folder in the current directory.

        Each stream gets its own sub-folder named after the stream path, with
        the property label appended when the property ID is known. The
        working directory is restored afterwards.
        """
        oldDir = os.getcwd()
        try:
            # Create a 'raw' folder
            rawDir = "raw"
            os.makedirs(rawDir)
            os.chdir(rawDir)
            sysRawDir = os.getcwd()
            # Loop through all the directories
            for dir_ in self.listdir():
                sysdir = "/".join(dir_)
                # The four characters before the 4-character type suffix
                code = dir_[-1][-8:-4]
                if code in properties:
                    sysdir = sysdir + " - " + properties[code]
                os.makedirs(sysdir)
                os.chdir(sysdir)
                # Generate appropriate filename
                if dir_[-1].endswith("001E"):
                    filename = "contents.txt"
                else:
                    filename = "contents"
                # Save contents of directory
                with open(filename, 'wb') as f:
                    f.write(self._getStream(dir_))
                # Return to base directory
                os.chdir(sysRawDir)
        finally:
            os.chdir(oldDir)

    def dump(self):
        """Print a summary of the message: subject, date and body."""
        print('Message')
        # Single-argument print() renders the same on Python 2 and 3
        print('Subject: %s' % (self.subject,))
        print('Date: %s' % (self.date,))
        print('Body:')
        print(self.body)

    def debug(self):
        """Print the path and contents of every ASCII string stream."""
        for dir_ in self.listdir():
            if dir_[-1].endswith('001E'):  # FIXME: Check for unicode 001F too
                print("Directory: " + str(dir_))
                print("Contents: %s" % (self._getStream(dir_),))

    def save_attachments(self, raw=False):
        """Saves only attachments in the same folder.

        The raw parameter is accepted but not used.
        """
        for attachment in self.attachments:
            attachment.save()
import hashlib
def main():
    """Print a short report for the .msg file named on the command line.

    The report lists the sender, subject and date, then the filename and MD5
    digest of each attachment, then every http(s) URL found in the body.
    Exits with status 1 and a usage line when no filename is given.
    """
    import re

    if len(sys.argv) < 2:
        print("USAGE: %s %s <filename>" % (sys.executable, sys.argv[0]))
        sys.exit(1)
    fpath = sys.argv[1]
    msg = Message(fpath)
    try:
        # Every print() takes exactly one argument so that the output is the
        # same whether or not print is a function (Python 2 vs 3).
        print("")
        print("--Email Info--")
        print("From: %s" % (msg.sender,))
        print("Subject: %s" % (msg.subject,))
        print("Date: %s" % (msg.date,))
        print("")
        if msg.attachments:
            print("--Files--")
            for attachment in msg.attachments:
                filename = attachment.longFilename
                # Otherwise use the short filename
                if filename is None:
                    filename = attachment.shortFilename
                print("Filename: %s" % (filename,))
                if attachment.data is None:
                    # No data stream to hash for this attachment
                    print("MD5: (no data)")
                else:
                    print("MD5: %s" % (hashlib.md5(attachment.data).hexdigest(),))
                print("")
            print("")
        print("")

        body = msg.body
        if body is None:
            body = ""
        elif isinstance(body, bytes):
            # The pattern below is a text pattern and only matches ASCII
            # characters, so a byte string is decoded with a codec that
            # accepts every byte value rather than risking a decode error.
            body = body.decode('latin-1')
        urls = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-;=@-_-&/\?$_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            body)
        if urls:
            print("--Links--")
            print("\n".join(urls))
            print("")
    finally:
        # Release the underlying file handle
        msg.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment