Skip to content

Instantly share code, notes, and snippets.

@empirasign
Last active September 11, 2023 19:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save empirasign/8237e8e68243e9f2532e88cf17cb84ca to your computer and use it in GitHub Desktop.
Save empirasign/8237e8e68243e9f2532e88cf17cb84ca to your computer and use it in GitHub Desktop.
demo script to parse runs in Outlook Inbox
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
parse_outlook_inbox.py
https://gist.github.com/empirasign/8237e8e68243e9f2532e88cf17cb84ca
demo script to parse runs in Outlook client Inbox
if Outlook not already running, launch Outlook
locate new messages in INBOX / SOURCE_FOLDER
for each new message
send message to parser
save parsed results JSON blob in temp directory
move parsed message to COMPLETED_FOLDER
fin
Things this script does not do
- it does not save results to a local database, only to local JSON files
- other than timestamps and SOURCE_FOLDER / COMPLETED_FOLDER message placements,
it has no real sense of what needs to be parsed and what has already been parsed
THIS SCRIPT WORKS ONLY ON WINDOWS
Requirements
Python3
https://www.python.org/downloads/
Python for Win32 (pywin32) extensions
https://github.com/mhammond/pywin32
requests
https://docs.python-requests.org/en/latest/
"""
import datetime
import base64
import json
import logging
import logging.handlers
import os
from pathlib import Path
import sys
import tempfile
import time
from urllib.parse import quote
import requests
import win32com.client # pylint: disable=import-error
import win32ui # pylint: disable=import-error
# Configuration Constants
# API_KEY / API_SECRET are sent in the body of every parse request (see main)
API_KEY = "MY_API_KEY" # provided by Empirasign
API_SECRET = "MY_API_SECRET" # provided by Empirasign
API_URL = 'https://api.empirasign.com/v1/parse-corp/' # Corporate runs parser
# for other parsing endpoints, see docs: https://www.empirasign.com/api-docs/
# PROXY_SERVER: leave as "" for a direct connection (no proxy)
PROXY_SERVER = ""
# If you need to locate your proxy server, the easiest way to do this is by
# using our tool on the "proxy_finder" tab of the demo api spreadsheet.
# Click the "Find Proxy" button, results will show up in cell B4
# demo api spreadsheet can be found at:
# https://www.empirasign.com/api-mbs/ click on Excel icon to download
# the proxy host usually looks like proxy.mycompany.net:8080
# if you get a 407 Proxy Authentication Required error, you need to set
# PROXY_SERVER to something like username:password@proxy.mycompany.net:8080
# NOTE(review): SOURCE_FOLDER is not referenced by the code visible in this
# script; main() reads the default Inbox folder directly - confirm before relying on it
SOURCE_FOLDER = "INBOX"
# COMPLETED_FOLDER is created as a subfolder of the Inbox if it does not exist
COMPLETED_FOLDER = "PARSED"
# this script maintains state via folder usage
# once an email is moved into COMPLETED_FOLDER, no further parsing attempts
# will be made
# RESULTS_DIR receives the *.json / *-error.json API responses
RESULTS_DIR = Path(tempfile.gettempdir())
# TEMP_DIR receives the saved .msg copies of each email and the log file
TEMP_DIR = Path(tempfile.gettempdir())
# Log to a size-capped rotating file in TEMP_DIR and mirror every record to stdout.
# backupCount must be non-zero: with the default of 0 RotatingFileHandler never
# rolls over, so maxBytes would be ignored and the log file would grow forever.
rfh = logging.handlers.RotatingFileHandler(filename=TEMP_DIR / "parse_outlook_inbox.log",
                                           maxBytes=5000000,
                                           backupCount=3)
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s - %(levelname)-8s - %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    handlers=[rfh, logging.StreamHandler(sys.stdout)])
# root logger, shared by every function in this script
logger = logging.getLogger()
def _proxies_dict(proxy_server):
"""
return proxy dictionary as needed by requests library
if this helper function is not comprehensive enough for your use case, consult
http://docs.python-requests.org/en/latest/user/advanced/#proxies
"""
if proxy_server:
return {'https': 'http://' + PROXY_SERVER}
return {}
def get_target_mail_items(inbox, dt0, dt1=None):
    """
    return the Outlook items whose ReceivedTime falls inside a date window

    inbox: object exposing Restrict(filter_str), e.g. the Items collection of a folder
    dt0:   first day of the window (datetime.date)
    dt1:   last day of the window; defaults to tomorrow when omitted

    If the bounds are passed in reverse order they are swapped. One extra day is
    added to the upper bound, so the filter sent to Outlook is the half-open
    range dt0 <= ReceivedTime < dt1 + 1 day.

    Returns whatever inbox.Restrict() returns for that filter.
    """
    if not dt1:
        dt1 = datetime.date.today() + datetime.timedelta(1)
    if dt1 < dt0:
        dt0, dt1 = dt1, dt0
    dt1 += datetime.timedelta(1)  # exclusive upper bound, so the end date itself is included

    # %I (12-hour clock) is the hour field that pairs with the AM/PM marker %p;
    # %H would emit stamps such as "00:00 AM" or "13:30 PM"
    stamp_fmt = r'%Y-%m-%d %I:%M %p'
    dt0_str = dt0.strftime(stamp_fmt)
    dt1_str = dt1.strftime(stamp_fmt)
    # https://documentation.help/Microsoft-Outlook-Visual-Basic-Reference/olmthRestrict.htm
    filter_str = "[ReceivedTime] >= '{}' And [ReceivedTime] < '{}'".format(dt0_str, dt1_str)
    logger.info("applying filter on inbox: %s", filter_str)
    target_items = inbox.Restrict(filter_str)
    logger.info('Detected %s target emails to parse', len(target_items))
    return target_items
def _safe_create_folder(root_folder, subfolder):
    """
    create subfolder under root_folder (aka INBOX) if it does not already exist

    root_folder: folder object exposing a Folders collection
    subfolder:   name of the child folder to look up or create

    Returns the child folder object, looked up by name in root_folder.Folders.
    """
    existing_names = [folder.Name for folder in root_folder.Folders]
    if subfolder in existing_names:
        # only report "already exists" when nothing had to be created
        logger.info("%s already exists as subfolder under INBOX", subfolder)
    else:
        logger.info("creating folder %s under INBOX", subfolder)
        root_folder.Folders.Add(subfolder)
    # return the destination folder as an object
    return root_folder.Folders[subfolder]
def msg_to_disk(msg):
    """
    Write an Outlook MailItem to TEMP_DIR as a .msg file and return its path.

    The file name is the item's EntryID, percent-quoted with no characters
    treated as safe, followed by the .msg extension. The path is returned as a str.
    """
    safe_name = quote(msg.EntryID, safe="")
    destination = str(TEMP_DIR / (safe_name + ".msg"))
    msg.SaveAs(destination)
    return destination
def main():
    """
    the main event

    1. make sure Outlook is running (try to launch it when no window is found)
    2. collect the Inbox items received today
    3. for each item: save it as a .msg file, POST it base64-encoded to API_URL,
       write the response to RESULTS_DIR and, on HTTP 200, move the item to
       COMPLETED_FOLDER
    4. stop early once the API reports that no requests are left in the quota

    A failed HTTP request (timeout, connection error) is logged and the loop
    continues with the next message; that message stays in the Inbox.

    Raises:
        RuntimeError: Outlook reports zero accounts.
        Exception: re-raised from os.startfile when Outlook cannot be launched.
    """
    t0 = time.time()
    logger.info("parse_outlook_inbox.py starting")

    # Check if Outlook is opened
    try:
        win32ui.FindWindow(None, "Microsoft Outlook")
    except win32ui.error:
        logger.warning("Outlook is not running, trying to start")
        try:
            os.startfile("outlook")  # pylint: disable=no-member
        except Exception:  # pylint: disable=broad-except
            logger.exception("Cannot find Outlook")
            raise

    outlook = win32com.client.Dispatch('outlook.application')
    mapi = outlook.GetNamespace('MAPI')

    # retrieve user's email address or exit if there are no accounts logged in
    if mapi.Accounts.Count == 0:
        logger.warning("Outlook is running but default user is not authenticated vs email server")
        raise RuntimeError("Outlook has no accounts available")

    # https://docs.microsoft.com/en-us/office/vba/api/outlook.oldefaultfolders
    inbox_folder = mapi.GetDefaultFolder(6)  # 6 is the Inbox entry of OlDefaultFolders
    dest_folder = _safe_create_folder(inbox_folder, COMPLETED_FOLDER)

    # Snapshot the matches into a plain list before the loop: items are moved out
    # of the folder while we iterate, and enumerating a live Outlook collection
    # that is shrinking can skip items. A list also makes the emptiness test and
    # the final count reliable.
    target_lst = list(get_target_mail_items(inbox_folder.Items, datetime.date.today()))
    total = len(target_lst)
    if not target_lst:
        logger.warning('No emails to parse')

    proxies = _proxies_dict(PROXY_SERVER)
    done_lst = []
    for msg in target_lst:
        # read the id once, before the item is moved
        entry_id = msg.EntryID
        # compute a filename stem that's safe on Windows and Linux
        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file
        safe_id = quote(entry_id, safe="")

        msg_path = msg_to_disk(msg)
        with open(msg_path, "rb") as msg_fp:  # close the handle promptly
            encoded_msg = base64.b64encode(msg_fp.read()).decode("ascii")
        post_data = {
            "api_key": API_KEY,
            "api_secret": API_SECRET,
            "msg": encoded_msg
        }
        try:
            resp = requests.post(API_URL, json=post_data, proxies=proxies, timeout=3.5)
        except requests.RequestException:
            # one bad request should not abort the remaining messages
            logger.exception("request failed for MailItem.EntryID: %s", entry_id)
            continue

        if resp.status_code == 200:
            done_lst.append(entry_id)
            obj = resp.json()
            fname = safe_id + ".json"
            # reversible via unquote(fname)[:-5]
            # explicit utf-8: ensure_ascii=False can emit characters that the
            # locale default encoding on Windows is unable to represent
            with open(RESULTS_DIR / fname, "wt", encoding="utf-8") as fp:
                fp.write(json.dumps(obj, sort_keys=True, indent=2, ensure_ascii=False))
            logger.info("saved parsed results MailItem.EntryID: %s filename: %s", entry_id,
                        fname)
            msg.Move(dest_folder)
            # check remaining quota
            if obj["meta"]["api_req_left"] < 1:
                logger.warning("early exit of loop, daily quota exhausted, processed %s / %s",
                               len(done_lst), total)
                break
        else:
            fname = safe_id + "-error.json"
            with open(RESULTS_DIR / fname, "wt", encoding="utf-8") as fp:
                fp.write(resp.text)
            logger.error("error occurred, http status code: %s, error file: %s",
                         resp.status_code, fname)

    tot_runtime = round(time.time() - t0)
    # report counts for both lists (not the collection object itself)
    logger.info("finished total run time secs: %s, target_lst: %s processed_lst: %s ", tot_runtime,
                total, len(done_lst))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment