Skip to content

Instantly share code, notes, and snippets.

@jkmackie
Forked from benwattsjones/gmail_mbox_parser.py
Created July 21, 2023 17:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jkmackie/8fbfdf7bb2d9dc8c732a11505489efb8 to your computer and use it in GitHub Desktop.
Save jkmackie/8fbfdf7bb2d9dc8c732a11505489efb8 to your computer and use it in GitHub Desktop.
Quick Python code to parse mbox files, specifically those exported by Gmail. Extracts the sender, date, plain-text contents, etc., and ignores base64 attachments.
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
import mailbox
import bs4
def get_html_text(html):
    """Return the visible text of an HTML document's body, or None.

    The markup is parsed with BeautifulSoup using the 'lxml' parser, and the
    text nodes under the body element are joined with single spaces, each
    one stripped of surrounding whitespace.

    None is returned whenever any step raises AttributeError -- typically
    because the parsed document has no body element (empty message
    contents), so there is nothing to call get_text() on.
    """
    try:
        soup = bs4.BeautifulSoup(html, 'lxml')
        body = soup.body
        text = body.get_text(' ', strip=True)
    except AttributeError:
        # Message contents empty: no body element was found.
        return None
    return text
class GmailMboxMessage():
    """Wrap one mailbox.mboxMessage and extract its headers and text parts.

    After parse_email() has been called, the extracted values are available
    as attributes (all None beforehand):

    * email_labels, email_date, email_from, email_to, email_subject --
      the 'X-Gmail-Labels', 'Date', 'From', 'To' and 'Subject' header
      values (None for a header the message does not carry).
    * email_text -- the list returned by read_email_payload().
    """

    def __init__(self, email_data):
        """Store the message to parse.

        Raises:
            TypeError: if email_data is not a mailbox.mboxMessage.
        """
        if not isinstance(email_data, mailbox.mboxMessage):
            raise TypeError('Variable must be type mailbox.mboxMessage')
        self.email_data = email_data
        # Populated by parse_email(); None until then.
        self.email_labels = None
        self.email_date = None
        self.email_from = None
        self.email_to = None
        self.email_subject = None
        self.email_text = None

    def parse_email(self):
        """Read the headers and payload and keep them on the instance.

        The values are stored as attributes rather than thrown away, so the
        caller can read them once this method returns.
        """
        self.email_labels = self.email_data['X-Gmail-Labels']
        self.email_date = self.email_data['Date']
        self.email_from = self.email_data['From']
        self.email_to = self.email_data['To']
        self.email_subject = self.email_data['Subject']
        self.email_text = self.read_email_payload()

    def read_email_payload(self):
        """Return a list of (content_type, encoding, text) tuples.

        A multipart message contributes one tuple per leaf part; any other
        message contributes a single tuple built from its raw payload.
        """
        email_payload = self.email_data.get_payload()
        if self.email_data.is_multipart():
            email_messages = list(self._get_email_messages(email_payload))
        else:
            email_messages = [email_payload]
        return [self._read_email_text(msg) for msg in email_messages]

    def _get_email_messages(self, email_payload):
        """Yield every non-multipart part found in email_payload, depth first."""
        for msg in email_payload:
            if isinstance(msg, (list, tuple)):
                yield from self._get_email_messages(msg)
            elif msg.is_multipart():
                yield from self._get_email_messages(msg.get_payload())
            else:
                yield msg

    def _read_email_text(self, msg):
        """Return (content_type, encoding, text) for one part.

        msg is either a message part or a plain string (the raw payload of a
        non-multipart message); a string is reported with content type and
        encoding 'NA' and is passed through get_html_text(). The text is None
        for base64-encoded parts and for parts that are neither text/plain
        nor text/html.
        """
        if isinstance(msg, str):
            content_type = 'NA'
            encoding = 'NA'
        else:
            content_type = msg.get_content_type()
            encoding = msg.get('Content-Transfer-Encoding', 'NA')
        # Transfer-encoding names are case-insensitive, so 'BASE64' and
        # 'Base64' must be skipped just like 'base64'.
        is_base64 = 'base64' in encoding.lower()
        if 'text/plain' in content_type and not is_base64:
            msg_text = msg.get_payload()
        elif 'text/html' in content_type and not is_base64:
            msg_text = get_html_text(msg.get_payload())
        elif content_type == 'NA':
            msg_text = get_html_text(msg)
        else:
            msg_text = None
        return (content_type, encoding, msg_text)
######################### End of library, example of use below
# Example of use. Guarded so that importing this file as a library does not
# try to open the placeholder path below.
if __name__ == '__main__':
    # create=False: a mistyped path raises mailbox.NoSuchMailboxError instead
    # of mailbox.mbox creating a new, empty file at that location.
    mbox_obj = mailbox.mbox('path/to/your-mbox-file-from-gmail.mbox',
                            create=False)
    num_entries = len(mbox_obj)
    # Count from 1 so the progress line runs "1 of N" ... "N of N", and print
    # it before parsing so a failure identifies the message being processed.
    for idx, email_obj in enumerate(mbox_obj, start=1):
        print('Parsing email {0} of {1}'.format(idx, num_entries))
        email_data = GmailMboxMessage(email_obj)
        email_data.parse_email()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment