Skip to content

Instantly share code, notes, and snippets.

@jflatow
Created August 19, 2013 22:20
Show Gist options
  • Save jflatow/6274886 to your computer and use it in GitHub Desktop.
Save jflatow/6274886 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Canonicalize emails (mostly by decoding them) to make them extraction-friendly.
"""
import email
import hashlib
import re
import sys
def strp(str, pattern=re.compile(r'\s*(\n)\s*')):
    """Replace every match of *pattern* in *str* with its first capture group.

    With the default pattern, each run of whitespace that contains at least
    one newline (trailing spaces, ``\\r\\n`` pairs, blank lines, leading
    indentation on the next line) collapses into a single ``\\n``.
    Whitespace runs without a newline are left alone.

    Args:
        str: Text to normalise.  The name shadows the builtin; it is kept
            so that existing keyword callers (``strp(str=...)``) still work.
        pattern: Compiled regular expression with at least one capture
            group; whatever group 1 matched replaces the whole match.

    Returns:
        The text with every match replaced.
    """
    # The pattern literal is a raw string so that ``\s`` reaches the regex
    # engine as written rather than being an invalid string escape.
    return pattern.sub(r'\1', str)
def text(msg, transform=strp):
    """Render a message as text with its leaf payloads decoded.

    The output is assembled in this order: one CRLF-terminated
    ``Name: value`` line per header, an empty CRLF line, the preamble (if
    any), the body, and the epilogue (if any).  A multipart body is rebuilt
    from its parts, each rendered by a recursive call and separated by
    ``--boundary`` delimiter lines.  Any other body is fetched with
    ``get_payload(decode=True)`` and passed through *transform*.

    Args:
        msg: Message object providing the ``email.message.Message`` members
            used here (``items``, ``preamble``, ``epilogue``,
            ``is_multipart``, ``get_boundary``, ``get_payload`` and
            ``get_content_charset``).
        transform: Callable applied to every decoded leaf payload,
            including those of nested parts; its result is spliced into
            the output.

    Returns:
        The rendered message as a single string.
    """
    chunks = ['%s: %s\r\n' % item for item in msg.items()]
    chunks.append('\r\n')  # blank line separating headers from body
    if msg.preamble:
        chunks.append(msg.preamble + '\r\n')
    if msg.is_multipart():
        bndy = msg.get_boundary()
        delimiter = '\r\n--%s\r\n' % bndy
        # Hand the caller's transform down to nested parts; otherwise a
        # custom transform would only ever apply to a top-level leaf.
        parts = delimiter.join(text(part, transform)
                               for part in msg.get_payload())
        chunks.append('--%s\r\n%s\r\n--%s--' % (bndy, parts, bndy))
    else:
        payload = msg.get_payload(decode=True)
        # On Python 3 the decoded payload is bytes, which can neither be
        # matched by a str pattern nor joined with the str pieces above, so
        # turn it into text using the part's declared charset.  On Python 2
        # bytes is str, so this branch is skipped and nothing changes.
        if isinstance(payload, bytes) and not isinstance(payload, str):
            charset = msg.get_content_charset() or 'ascii'
            try:
                payload = payload.decode(charset, 'replace')
            except LookupError:
                # The message declares a charset Python has no codec for.
                payload = payload.decode('ascii', 'replace')
        chunks.append(transform(payload))
    if msg.epilogue:
        chunks.append(msg.epilogue + '\r\n')
    return ''.join(chunks)
# The inputs are opened in binary mode, so use the bytes-aware parser where
# the email package offers one (Python 3) and otherwise fall back to the
# parser this script has always used.
_parse_message = getattr(email, 'message_from_binary_file',
                         email.message_from_file)
for arg in sys.argv[1:]:
    # Close each input once it has been rendered instead of leaking the handle.
    with open(arg, 'rb') as source:
        txt = text(_parse_message(source))
    # hashlib and the binary output file both need bytes.  A byte string is
    # used as-is; text is encoded, with surrogateescape so that any raw
    # bytes the bytes-aware parser carried through as surrogate escapes are
    # written back out unchanged.
    if isinstance(txt, bytes):
        data = txt
    else:
        data = txt.encode('utf-8', 'surrogateescape')
    # The output file is named after the SHA-1 digest of what it contains.
    with open('%s.eml' % hashlib.sha1(data).hexdigest(), 'wb') as handle:
        handle.write(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment