Skip to content

Instantly share code, notes, and snippets.

@TimRepke
Last active July 11, 2023 12:01
Show Gist options
  • Save TimRepke/8061a4750c77a1c0045132098c91ea99 to your computer and use it in GitHub Desktop.
Save TimRepke/8061a4750c77a1c0045132098c91ea99 to your computer and use it in GitHub Desktop.
PST Archive to RFC822 (*.eml) script

PST Archive to RFC822

This script extracts all emails from an Outlook PST archive and saves them into some output folder as individual RFC822 compliant *.eml files.

Installing the external dependency pypff may not be straightforward (it wasn't for me). I forked the original repository to make it work in Python 3. If you get errors, check their wiki pages for help or try my fork. Below are the steps that worked for me:

Clone https://github.com/libyal/libpff/tree/master/pypff

cd libpff/
./synclibs.sh
./autogen.sh
./configure --enable-python
make
sudo make install
python setup.py build
sudo python setup.py install

Now that everything is installed, you can execute the script as follows: python pst2eml.py /path/to/archive.pst /path/to/output/dir

Optionally, you can write the log into a file by adding --logfile=/path/to/log_dir to the command. The value is a directory (created if missing); the log is written to pst_indexer.log inside it.

Full disclaimer: I was inspired by this script, but as you may see, I pretty much threw everything overboard and made my own thing. Only kept the logging and argparse really.

import os
import argparse
import logging
import re
import pypff
def process_folder(folder, path):
    """Recursively export every message below ``folder`` as an ``*.eml`` file.

    All files are written directly into the module-level ``output_directory``
    (set by the script entry point). The file name is the sanitised folder
    path followed by the index of the message within its folder.

    Args:
        folder: pypff folder object. This function reads its ``name``,
            ``number_of_sub_messages``, ``number_of_sub_folders``,
            ``sub_messages`` and ``sub_folders`` attributes.
        path: Slash-separated path of the parent folder; for the initial call
            this is the base name of the PST file.
    """
    folder_path = path + '/' + (folder.name or 'root')
    n_msg = folder.number_of_sub_messages
    logging.debug('Processing folder "{}" with {} sub-folders and {} messages; full path: "{}"'.format(
        folder.name, folder.number_of_sub_folders, n_msg, folder_path))

    # Lower-case the path, drop everything except [a-z0-9 /] and then turn
    # blanks and slashes into underscores so the result is a flat file name.
    safe_path = re.sub(r'[ /]', '_', re.sub(r'[^a-z0-9 /]', '', folder_path.lower()))

    for mi, message in enumerate(folder.sub_messages):
        logging.debug('{}/{} > Processing message by {} with subject: {}'.format(mi, n_msg,
                                                                                 message.sender_name, message.subject))
        msg = process_message(message)
        if not msg:
            # Nothing to write (e.g. process_message gave up on the item).
            logging.debug(' -- nothing to save, skipping')
            continue

        fname = os.path.join(output_directory, safe_path + '_' + str(mi) + '.eml')
        logging.debug(' -- saving as {}'.format(fname))
        # newline='' keeps the explicit CRLF line endings untouched (text mode
        # would turn them into CR CR LF on Windows); an explicit encoding
        # avoids UnicodeEncodeError on locales that cannot represent the text.
        with open(fname, 'w', encoding='utf-8', newline='') as f:
            f.write(msg)

    for sub_folder in folder.sub_folders:
        process_folder(sub_folder, folder_path)
def get_body(msg):
    """Return the first non-empty body of ``msg`` as ``(body_type, text)``.

    The bodies are tried in the order plain text, HTML, RTF. If none of them
    holds any text, ``('plain-text', '')`` is returned.

    Args:
        msg: pypff message object; its ``plain_text_body``, ``html_body`` and
            ``rtf_body`` attributes are read.

    Returns:
        Tuple ``(body_type, text)`` where ``body_type`` is one of
        ``'plain-text'``, ``'html'`` or ``'rtf'`` and ``text`` is a stripped
        ``str``.
    """
    def decode(raw):
        # Mails inside one PST are not consistently encoded: try UTF-8 first,
        # then Windows-1252, and as a last resort keep whatever is readable.
        for codec in ('utf-8', 'cp1252'):
            try:
                return raw.decode(codec)
            except UnicodeDecodeError:
                continue
        return raw.decode('utf-8', errors='replace')

    def prep(b):
        # Normalise a raw body to a stripped str, or None when it is empty.
        if isinstance(b, bytes):
            b = decode(b)
        return b.strip() if b else None

    candidates = (
        ('plain-text', 'plain_text_body'),
        ('html', 'html_body'),
        ('rtf', 'rtf_body'),
    )
    for btype, attribute in candidates:
        try:
            body = prep(getattr(msg, attribute))
        except OSError as err:
            # pypff has been seen to raise OSError while reading a body;
            # fall through to the next representation instead of aborting.
            logging.error('Could not read %s of message: %s', attribute, err)
            continue
        if body:
            return btype, body

    return 'plain-text', ''
def process_message(message):
    """Render a pypff message as RFC822-style text (headers, blank line, body).

    The transport headers stored with the message are copied first; a header
    name that occurs again is prefixed with ``X-``. A set of ``X-`` headers
    built from the message properties follows, then the body chosen by
    ``get_body``.

    Args:
        message: pypff message object; ``transport_headers``, ``sender_name``,
            ``delivery_time``, ``creation_time``, ``client_submit_time``,
            ``subject`` and ``number_of_attachments`` are read, and the object
            is passed on to ``get_body``.

    Returns:
        The message as one ``str`` with CRLF line endings in the header part.
    """
    def one_line(value):
        # None-safe, and a raw line break inside a value would end the header
        # block early, so collapse it to a blank.
        return re.sub(r'[\r\n]+', ' ', value or '')

    lines = []
    seen = set()
    in_header = False  # True while the previous line belonged to a header
    # Items such as drafts or appointments may carry no transport headers.
    for raw in (message.transport_headers or '').split('\n'):
        raw = raw.rstrip('\r')  # accept both CRLF and bare LF line endings

        if raw[:1] in (' ', '\t'):
            # Folded continuation of the previous header: keep it verbatim.
            if in_header and raw.strip():
                lines.append(raw)
            continue

        found = re.match(r'^([^\s:]+):[ \t]*(.*)$', raw)
        in_header = found is not None
        if not found:
            continue

        key = found.group(1).capitalize()
        if key in seen:
            key = 'X-' + key
        seen.add(key)
        val = found.group(2)

        if key == 'Date':
            # Keep at most "weekday, date"; strip the parts so the rejoined
            # value has exactly one blank after the comma.
            val = ', '.join(part.strip() for part in val.split(',')[:2])

        lines.append(key + ': ' + val)

    lines.append('X-Sender-Name: ' + one_line(message.sender_name))
    lines.append('X-Delivery-Time: ' + str(message.delivery_time))
    lines.append('X-Creation-Time: ' + str(message.creation_time))
    lines.append('X-Client-Submit-Time: ' + str(message.client_submit_time))
    lines.append('X-Subject: ' + one_line(message.subject))
    lines.append('X-Attachments: ' + str(message.number_of_attachments))

    btype, body = get_body(message)
    lines.append('X-Body-Type: ' + btype)

    # Every header line ends in CRLF; one empty line separates the body.
    return '\r\n'.join(lines) + '\r\n\r\n' + body
if __name__ == "__main__":
    # Command line: <PST_FILE> <OUTPUT_DIR> [--logfile LOG_DIR]
    parser = argparse.ArgumentParser()
    parser.add_argument('PST_FILE', help="PST File Format from Microsoft Outlook")
    parser.add_argument('OUTPUT_DIR', help="Directory of output for temporary and report files.")
    # The value is used as a directory; the log itself is pst_indexer.log.
    parser.add_argument('--logfile', default=None,
                        help='Directory in which the log file pst_indexer.log is written.')
    args = parser.parse_args()

    # Deliberately a module-level name: process_folder() reads it as a global.
    output_directory = os.path.abspath(args.OUTPUT_DIR)
    os.makedirs(output_directory, exist_ok=True)

    if args.logfile:
        os.makedirs(args.logfile, exist_ok=True)
        log_path = os.path.join(args.logfile, 'pst_indexer.log')
    else:
        # filename=None makes basicConfig log to the console instead.
        log_path = None
    logging.basicConfig(level=logging.DEBUG, filename=log_path,
                        format='%(asctime)s | %(levelname)s | %(message)s', filemode='w')

    logging.info('Starting Script...')
    pst_file = args.PST_FILE

    # Open outside the try block: if creating or opening the file object
    # fails there is nothing to close, and the original error must not be
    # masked by a failing close().
    pff_file = pypff.file()
    pff_file.open(pst_file)
    try:
        process_folder(pff_file.root_folder, os.path.basename(pst_file))
    except Exception:
        # Record the traceback in the log (it may be a file) before exiting.
        logging.exception('Export of "%s" failed', pst_file)
        raise
    finally:
        pff_file.close()
    logging.info('Script Complete')
@thenktor
Copy link

thenktor commented Oct 23, 2019

Hi,
I'm getting this error:

  File "pst2eml.py", line 110, in <module>
    process_folder(pff_file.root_folder, os.path.basename(pst_file))
  File "pst2eml.py", line 27, in process_folder
    process_folder(sub_folder, folder_path)
  File "pst2eml.py", line 27, in process_folder
    process_folder(sub_folder, folder_path)
  File "pst2eml.py", line 20, in process_folder
    msg = process_message(message)
  File "pst2eml.py", line 75, in process_message
    btype, body = get_body(message)
  File "pst2eml.py", line 36, in get_body
    body = prep(msg.plain_text_body)
  File "pst2eml.py", line 34, in prep
    return b.strip()
AttributeError: 'NoneType' object has no attribute 'strip'

@TimRepke
Copy link
Author

Hi @thenktor, thanks for the hint. Apparently there is an email that can't be parsed or has no message text. A quick workaround would be to return an empty string. This way you at least get something, but it might not be complete due to an error in pypff.
I updated the script above.

@thenktor
Copy link

thenktor commented Oct 24, 2019

Hi @TimRepke, thanks for the answer. Now I have a lot of encoding problems: UnicodeDecodeError: 'utf8' codec can't decode byte 0xe4 in position 2164: invalid continuation byte
It turns out that the mails in the PST file have different encodings. So far, this is my fix in the get_body function:

def get_body(msg):
    """Return ``(body_type, text)`` for the first usable body of ``msg``.

    The plain-text, HTML and RTF bodies are tried in that order. A body that
    cannot be read because pypff raises ``OSError`` is skipped. If no body
    holds any text, ``('plain-text', '')`` is returned.

    Args:
        msg: pypff message object; its ``plain_text_body``, ``html_body`` and
            ``rtf_body`` attributes are read.

    Returns:
        Tuple of the body type (``'plain-text'``, ``'html'`` or ``'rtf'``) and
        the stripped body text.
    """
    def prep(b):
        # Normalise a raw body to a stripped str, or None when it is empty.
        if not isinstance(b, bytes):
            return b.strip() if b else None

        # The mails in one PST file use different encodings, so try the
        # likely ones in turn.
        for codec in ('utf8', 'cp1252'):
            try:
                text = b.decode(codec)
            except UnicodeDecodeError:
                logging.info('Is not {}...'.format(codec))
            else:
                logging.info('Is {}...'.format(codec))
                return text.strip() if text else None

        # Neither codec fits: keep the readable part instead of dropping the
        # whole body.
        logging.error('Could not decode body cleanly, replacing invalid bytes!')
        text = b.decode('utf8', errors='replace')
        return text.strip() if text else None

    candidates = (
        ('plain-text', 'plain_text_body'),
        ('html', 'html_body'),
        ('rtf', 'rtf_body'),
    )
    for body_type, attribute in candidates:
        try:
            body = prep(getattr(msg, attribute))
        except OSError:
            # Reading a body can fail inside pypff; try the next one.
            logging.error('A PyPff error!')
            continue
        if body:
            return body_type, body

    return 'plain-text', ''

@thenktor
Copy link

thenktor commented Oct 24, 2019

Next error: Some mails are missing specific headers, e.g. an Exchange appointment invitation mail apparently lacks message.transport_headers, and an e-mail draft was missing message.sender_name.
Also, there are a lot of items that are missing a message body (vCards and stuff).
Therefore I've changed the process_message function as follows:

def process_message(message):
    """Render a pypff message as RFC822-style text, or ``None`` without body.

    The transport headers stored with the message are copied first; a header
    name that occurs again is prefixed with ``X-``. ``X-`` headers are then
    added for every message property that has a value, followed by the body
    chosen by ``get_body``.

    Args:
        message: pypff message object; ``transport_headers``, ``sender_name``,
            ``delivery_time``, ``creation_time``, ``client_submit_time``,
            ``subject`` and ``number_of_attachments`` are read, and the object
            is passed on to ``get_body``.

    Returns:
        The message as one ``str`` with CRLF line endings in the header part,
        or ``None`` when ``get_body`` finds no body text (non-mail items).
    """
    def one_line(value):
        # A raw line break inside a value would end the header block early.
        return re.sub(r'[\r\n]+', ' ', value)

    lines = []
    seen = set()
    in_header = False  # True while the previous line belonged to a header
    # Some items (e.g. appointment invitations) have no transport headers.
    for raw in (message.transport_headers or '').split('\n'):
        raw = raw.rstrip('\r')  # accept both CRLF and bare LF line endings

        if raw[:1] in (' ', '\t'):
            # Folded continuation of the previous header: keep it verbatim.
            if in_header and raw.strip():
                lines.append(raw)
            continue

        found = re.match(r'^([^\s:]+):[ \t]*(.*)$', raw)
        in_header = found is not None
        if not found:
            continue

        key = found.group(1).capitalize()
        if key in seen:
            key = 'X-' + key
        seen.add(key)
        val = found.group(2)

        if key == 'Date':
            # Keep at most "weekday, date"; strip the parts so the rejoined
            # value has exactly one blank after the comma.
            val = ', '.join(part.strip() for part in val.split(',')[:2])

        lines.append(key + ': ' + val)

    # Only emit the X- headers whose property is actually set.
    if message.sender_name:
        lines.append('X-Sender-Name: ' + one_line(message.sender_name))
    if message.delivery_time:
        lines.append('X-Delivery-Time: ' + str(message.delivery_time))
    if message.creation_time:
        lines.append('X-Creation-Time: ' + str(message.creation_time))
    if message.client_submit_time:
        lines.append('X-Client-Submit-Time: ' + str(message.client_submit_time))
    if message.subject:
        lines.append('X-Subject: ' + one_line(message.subject))
    if message.number_of_attachments:
        lines.append('X-Attachments: ' + str(message.number_of_attachments))

    btype, body = get_body(message)
    lines.append('X-Body-Type: ' + btype)

    if not body:
        logging.debug('Missing message body!')
        return None

    # Every header line ends in CRLF; one empty line separates the body.
    return '\r\n'.join(lines) + '\r\n\r\n' + body

In process_folder function I've changed as follows to discard non mail items without body:

        # With the process_message() variant that returns None for items
        # without a body, only write a file when there is something to save.
        msg = process_message(message)
        if msg:
            fname = os.path.join(output_directory, safe_path + '_' + str(mi) + '.eml')
            logging.debug('  -- saving as {}'.format(fname))
            with open(fname, 'w') as f:
                f.write(msg)

@thenktor
Copy link

Now the script is working on my test.pst file, but the attachments are missing from the generated *.eml files.

@capntrips
Copy link

In case anyone else ends up here, see libyal/libpff#2 to check the status of python bindings for attachments.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment