Skip to content

Instantly share code, notes, and snippets.

@DonBattery
Last active June 19, 2021 20:51
Show Gist options
  • Save DonBattery/870f7d8c408b07ffe264f59bb8991a0f to your computer and use it in GitHub Desktop.
Save DonBattery/870f7d8c408b07ffe264f59bb8991a0f to your computer and use it in GitHub Desktop.
Extract info and attachments from a raw .eml Email file
# Based on https://stackoverflow.com/a/31392574/14711510
from pathlib import Path
import email
import email.utils as email_utils
import json
class EmailInfo:
    '''Headers, body content and attachments extracted from a raw .eml file.

    Attachments are saved into an ``attachments`` directory created next to
    the e-mail file.

    Attributes:
    from_address -- Address part of the From header ('' when absent).
    to_address -- Address part of the To header ('' when absent).
    subject -- Stripped Subject header ('' when absent).
    date -- Date header as a datetime, or None when absent or unparseable.
    text -- Concatenated text/plain content of all parts.
    html -- Concatenated text/html content of all parts.
    files -- Dictionary mapping attachment file names to their saved Paths.
    '''

    def __init__(self, email_file: str) -> None:
        '''Parses the e-mail at email_file and extracts its content.

        email_file -- Path of the raw .eml file to read.
        '''
        email_file_path = Path(email_file)
        self.__email_files_dir = email_file_path.parent / 'attachments'
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        self.__email_files_dir.mkdir(parents=True, exist_ok=True)

        # Parse from bytes: a text-mode read would decode the whole file with
        # the locale encoding, which can fail on (or corrupt) 8-bit content.
        with email_file_path.open('rb') as email_stream:
            email_message = email.message_from_binary_file(email_stream)

        _, self.from_address = email_utils.parseaddr(
            self.__header(email_message, 'from'))
        _, self.to_address = email_utils.parseaddr(
            self.__header(email_message, 'to'))
        self.subject = self.__header(email_message, 'subject').strip()
        try:
            self.date = email_utils.parsedate_to_datetime(
                self.__header(email_message, 'date'))
        except (TypeError, ValueError):
            # Missing or malformed Date header.
            self.date = None
        self.text, self.html, self.files = self.__extract(email_message)

    @staticmethod
    def __header(message, name: str) -> str:
        '''Returns the named header as a string, or '' when it is absent.'''
        value = message[name]
        return '' if value is None else str(value)

    @staticmethod
    def __safe_file_name(file_name: str) -> str:
        '''Reduces an attachment name to a bare file name.

        The name comes from the e-mail itself, so directory components
        (e.g. '../') are dropped to keep the file inside the attachments
        directory. Names that reduce to nothing become 'attachment'.
        '''
        base_name = Path(file_name.replace('\\', '/')).name
        if base_name in ('', '.', '..'):
            return 'attachment'
        return base_name

    @staticmethod
    def __decode_text(message) -> str:
        '''Decodes a non-multipart part's payload using its declared charset.

        Falls back to UTF-8 when no charset is declared or the declared one
        is unknown; undecodable bytes are replaced instead of raising.
        '''
        raw = message.get_payload(decode=True) or b''
        charset = message.get_content_charset() or 'utf-8'
        try:
            return raw.decode(charset, errors='replace')
        except LookupError:
            return raw.decode('utf-8', errors='replace')

    def __extract(self, message) -> tuple:
        '''Extracts content from an e-mail message. This works for multipart and nested multipart messages too.

        message -- email.Message() or mailbox.Message()

        Returns tuple(text, html, files)
        text -- All text from all parts.
        html -- All HTMLs from all parts.
        files -- Dictionary mapping extracted attachment files.
        '''
        files = {}

        if message.is_multipart():
            # The payload of a multipart message is a list of sub-messages;
            # each one goes back through __extract.
            texts = []
            htmls = []
            for part in message.get_payload():
                part_text, part_html, part_files = self.__extract(part)
                texts.append(part_text)
                htmls.append(part_html)
                files.update(part_files)
            return ''.join(texts), ''.join(htmls), files

        file_name = message.get_filename()
        if file_name:  # It's an attachment
            safe_name = self.__safe_file_name(file_name)
            target = self.__email_files_dir / safe_name
            files[safe_name] = target
            # A file that is already on disk is left untouched.
            if not target.exists():
                target.write_bytes(message.get_payload(decode=True) or b'')
            return '', '', files

        # Not an attachment: keep text and HTML, ignore any other data.
        content_type = message.get_content_type()
        if content_type == 'text/plain':
            return self.__decode_text(message), '', files
        if content_type == 'text/html':
            return '', self.__decode_text(message), files
        return '', '', files

    def __repr__(self) -> str:
        '''Returns the extracted information as an indented JSON document.'''
        # default=str serializes the datetime and the attachment Paths.
        return json.dumps({
            'subject': self.subject,
            'from': self.from_address,
            'to': self.to_address,
            'date': self.date,
            'text': self.text,
            'html': self.html,
            'files': self.files,
        }, indent=4, sort_keys=True, default=str)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment