Created
April 24, 2017 19:33
-
-
Save Mortal/858ec48bcb59d5861688d964c9d5a750 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import email | |
import email.header | |
import email.message | |
import email.utils | |
# import collections | |
# import pprint | |
# Parse the on-disk mail dump (read as bytes) into a single message object
# that the rest of the script walks through.
with open('haengermails.mail', 'rb') as mail_file:
    super_message = email.message_from_binary_file(mail_file)
def parse_header(v):
    """Decode a raw header value into a readable string.

    RFC 2047 encoded words (``=?charset?q?...?=``) are decoded first.  The
    result is then checked for "double encoding": text whose UTF-8 bytes were
    mistakenly interpreted as Latin-1.  If re-encoding the text as Latin-1 and
    decoding it as UTF-8 succeeds and yields a different string, that repaired
    string is returned and a diagnostic line is printed.

    Args:
        v: The raw header value, or ``None`` when the header is absent.

    Returns:
        The decoded (and, if necessary, repaired) header text.  A missing
        header (``None``) yields the empty string instead of raising.
    """
    if v is None:
        # Looking up an absent header on a message gives None, which
        # decode_header() cannot handle; treat it as an empty value.
        return ''
    v = str(email.header.make_header(email.header.decode_header(v)))
    try:
        v2 = v.encode('latin1').decode('utf8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not representable as Latin-1, or the bytes are not valid UTF-8:
        # the text was not double-encoded, so keep it as decoded.
        return v
    if v != v2:
        print("Double-encoded %r" % (v2,))
    return v2
# Verbose-mode regex (whitespace is ignored under re.X) matching the complete
# name of every header that strip_headers() discards.
STRIP_HEADERS = r'''
^(Authentication-Results|Delivered-To|Received|Received-SPF|Return-Path|
X-Received|X-OriginalArrivalTime|X-Sieve|X-NFIT.*|X-Scanned-By|X-Sim|
X-SPF-Fail|X-Forefront-.*|X-SpamScore|X-BigFish|X-OriginatorOrg|
X-NilSimsa-Score|X-FOPE-CONNECTOR|X-FFO-Routing-Override)$
'''


def strip_headers(items):
    """Return the header pairs whose name is not listed in STRIP_HEADERS.

    Args:
        items: Iterable of sequences whose first element is the header name
            (for example ``(name, value)`` tuples).

    Returns:
        A new list with the surviving entries in their original order.  Names
        are compared case-insensitively.
    """
    kept = []
    for pair in items:
        header_name = pair[0]
        if re.match(STRIP_HEADERS, header_name, re.X | re.I) is None:
            kept.append(pair)
    return kept
# header_counts = collections.Counter()

# Sanity-check the container layout: the first part must be text, and every
# following part is handled as an attached message.
assert super_message.get_payload(0).get_content_maintype() == 'text'
attachments = super_message.get_payload()[1:]
for attachment in attachments:  # type: email.message.Message
    assert attachment.get_content_type() == 'message/rfc822'
    message = attachment.get_payload(0)
    assert message.get_content_type() == 'text/plain'
    body = message.get_payload()
    receiveds = message.get_all('Received')
    if receiveds is None:
        # Dump the headers for diagnosis, then fail explicitly instead of
        # crashing on the subscript below.
        print(message.items())
        raise ValueError('embedded message has no Received header')
    # Inspect the last Received header in the list.  The pattern is a raw
    # string so that \d, \S and \w reach the regex engine without relying on
    # invalid string escape sequences.
    mo = re.search(r'from (userid \d+|\S+\w)', receiveds[-1])
    assert mo is not None, receiveds[-1]
    assert mo.group(1) in ('userid 13', 'prodekanus.auitdrift.client.au.dk')
    date = email.utils.parsedate_to_datetime(message['Date'])
    from_name, from_email = email.utils.parseaddr(
        parse_header(message['From']))
    subject = parse_header(message['Subject'])
    # Only consumed by the commented-out header_counts bookkeeping below.
    headers = strip_headers(message.items())
    print(date, from_name, from_email, subject, len(body))
    # header_counts += collections.Counter(k for k, v in headers)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment