Skip to content

Instantly share code, notes, and snippets.

@Mortal
Created April 24, 2017 19:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Mortal/858ec48bcb59d5861688d964c9d5a750 to your computer and use it in GitHub Desktop.
Save Mortal/858ec48bcb59d5861688d964c9d5a750 to your computer and use it in GitHub Desktop.
import re
import email
import email.header
import email.message
import email.utils
# import collections
# import pprint
# Parse the mail file in binary mode; the resulting container message is
# what the rest of the script walks through.
with open('haengermails.mail', 'rb') as mail_file:
    super_message = email.message_from_binary_file(mail_file)
def parse_header(v):
    """Decode an RFC 2047 encoded header value into a plain ``str``.

    After the regular decoding, the result is re-encoded as Latin-1 and
    decoded as UTF-8.  If that round trip succeeds and yields a different
    string, the header is treated as double-encoded: a notice is printed
    and the repaired string is returned.

    Args:
        v: Raw header value, or ``None`` when the header is absent (which
            is what ``Message.__getitem__`` returns for a missing header).

    Returns:
        The decoded header text; ``''`` when ``v`` is ``None``.
    """
    if v is None:
        # A missing header would otherwise fail inside decode_header()
        # with an error that does not mention the header at all.
        return ''
    v = str(email.header.make_header(email.header.decode_header(v)))
    try:
        v2 = v.encode('latin1').decode('utf8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Either not representable in Latin-1 or not valid UTF-8 bytes,
        # so the value was not double-encoded; keep it as decoded.
        return v
    if v != v2:
        print("Double-encoded %r" % (v2,))
        return v2
    return v
# Verbose-mode pattern (whitespace and newlines are ignored under re.X)
# naming the headers that strip_headers() removes.
STRIP_HEADERS = r'''
^(Authentication-Results|Delivered-To|Received|Received-SPF|Return-Path|
X-Received|X-OriginalArrivalTime|X-Sieve|X-NFIT.*|X-Scanned-By|X-Sim|
X-SPF-Fail|X-Forefront-.*|X-SpamScore|X-BigFish|X-OriginatorOrg|
X-NilSimsa-Score|X-FOPE-CONNECTOR|X-FFO-Routing-Override)$
'''

# Compiled once so the pattern is not re-resolved for every header item.
_STRIP_HEADERS_RE = re.compile(STRIP_HEADERS, re.X | re.I)


def strip_headers(items):
    """Return the header items whose name is not listed in STRIP_HEADERS.

    Args:
        items: Iterable of sequences whose first element is the header
            name (for example the ``(name, value)`` pairs produced by
            ``Message.items()``).

    Returns:
        A new list with the surviving items in their original order.
        Header names are compared case-insensitively.
    """
    return [item for item in items if not _STRIP_HEADERS_RE.match(item[0])]
# header_counts = collections.Counter()
# Sanity check: the first part of the container message must be text;
# every part after it is handled as an attached message below.
assert super_message.get_payload(0).get_content_maintype() == 'text'
attachments = super_message.get_payload()[1:]
for attachment in attachments:  # type: email.message.Message
    assert attachment.get_content_type() == 'message/rfc822'
    message = attachment.get_payload(0)
    assert message.get_content_type() == 'text/plain'
    body = message.get_payload()
    receiveds = message.get_all('Received')
    if receiveds is None:
        # Dump the headers for diagnosis, then fail with a clear error
        # instead of a TypeError from indexing None below.
        print(message.items())
        raise ValueError('message has no Received header')
    # Inspect the last Received header in the list (typically the earliest
    # hop, since relays usually prepend their own header).
    mo = re.search(r'from (userid \d+|\S+\w)', receiveds[-1])
    assert mo is not None, receiveds[-1]
    assert mo.group(1) in ('userid 13', 'prodekanus.auitdrift.client.au.dk')
    date = email.utils.parsedate_to_datetime(message['Date'])
    from_name, from_email = email.utils.parseaddr(
        parse_header(message['From']))
    subject = parse_header(message['Subject'])
    headers = strip_headers(message.items())
    print(date, from_name, from_email, subject, len(body))
    # header_counts += collections.Counter(k for k, v in headers)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment