Skip to content

Instantly share code, notes, and snippets.

@ptwobrussell
Last active December 26, 2016 23:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ptwobrussell/8791064 to your computer and use it in GitHub Desktop.
Save ptwobrussell/8791064 to your computer and use it in GitHub Desktop.
A modification of MTSW2E Example 6-3 (http://bit.ly/1aWYgAv) with improvements toward getting the code to work seamlessly on mailboxes exported from Google Takeout.
"""
A modification of MTSW2E Example 6-3 (http://bit.ly/1aWYgAv) with the following modifications:
* Extra debugging information is written to sys.stderr to help isolate any problematic content
that may be encountered.
* A (hopeful) fix to a blasted UnicodeEncodeError in cleanContent() that may be triggered from
quopri.decodestring attempting to decode an already decoded Unicode value.
* The JSONification in jsonifyMessage now ignores any content that's not text. MIME-encoded content
such as images, PDFs, and other non-text data that is not useful for textual analysis without
significant additional work is now no longer carried forward into the JSON for import into MongoDB.
* This modified example can run as a standalone script and accepts the mbox file as a command-line
argument.
Example usage:
$ python thisScript.py yourMailboxFile.mbox
"""
import calendar
import email
import json
import mailbox
import quopri
import sys
import time

from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse
# The mbox file to convert is the first command-line argument; the JSON output
# path is the same path with '.json' appended. Exit with a usage message
# instead of an IndexError traceback when the argument is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: python thisScript.py yourMailboxFile.mbox")
MBOX = sys.argv[1]
OUT_FILE = MBOX + '.json'
def cleanContent(msg):
    """Return the plain text of a message body.

    The body is decoded from "quoted printable" format and then passed
    through BeautifulSoup so that only its text nodes are kept.

    msg -- the message body as a text (unicode) string.

    Returns the concatenated text nodes, or '' when BeautifulSoup fails to
    parse the decoded content.
    """
    # Decode message from "quoted printable" format, but first
    # re-encode, since decodestring will try to do a decode of its own
    msg = quopri.decodestring(msg.encode('utf-8'))

    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup. Only
    # ordinary errors count as "unparseable content": KeyboardInterrupt and
    # SystemExit must still propagate so a long export can be interrupted.
    try:
        soup = BeautifulSoup(msg)
    except Exception:
        return ''

    return ''.join(soup.findAll(text=True))
# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
    """JSON encoder that serializes otherwise-unsupported iterables as lists."""

    def default(self, o):
        # json only calls this hook for objects it cannot serialize on its
        # own (a generator, for instance); expand such an object into a list.
        return list(o)
# The generator itself...
def gen_json_msgs(mb):
    """Yield the JSON-ready dict for each message in the mailbox.

    mb -- a mailbox object whose next() method returns the next message,
          or None once there are no more messages.

    A progress line is written to sys.stderr before each message is
    converted with jsonifyMessage().
    """
    msg_number = 0
    msg = mb.next()
    while msg is not None:
        sys.stderr.write("Processing message number %d\n" % msg_number)
        sys.stderr.flush()
        yield jsonifyMessage(msg)
        msg_number += 1
        msg = mb.next()
def jsonifyMessage(msg):
    """Convert an email message into a JSON-serializable dict.

    msg -- a parsed message object providing items() and walk(), such as
           the ones produced by email.message_from_file.

    Returns a dict holding every header decoded to text, the To/Cc/Bcc
    headers (when present) split into lists of addresses, a 'parts' list
    with the cleaned content of each text MIME part, and the Date header
    rewritten as {'$date': <milliseconds since epoch>}. A message without
    a Date header is kept and simply has no 'Date' key.

    Re-raises any exception from cleanContent() after dumping the offending
    content to sys.stderr; errors from the date parser propagate as well.
    """
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.
    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        # The value was already decoded above. Decoding it a second time makes
        # Python 2 implicitly ASCII-encode it first, which raises
        # UnicodeEncodeError for any non-ASCII address, so only strip the
        # whitespace and split.
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '')\
            .replace('\r', '').replace(' ', '').split(',')

    for part in msg.walk():
        json_part = {}
        if part.get_content_maintype() != 'text':
            print >> sys.stderr, "Skipping MIME content in JSONification ({0})".format(part.get_content_maintype())
            continue

        json_part['contentType'] = part.get_content_type()
        # NOTE(review): the payload is taken undecoded and cleanContent()
        # always applies quoted-printable decoding, so text parts sent with a
        # different transfer encoding (e.g. base64) presumably stay encoded --
        # confirm before relying on their content.
        content = part.get_payload(decode=False).decode('utf-8', 'ignore')
        try:
            json_part['content'] = cleanContent(content)
        except Exception:
            print >> sys.stderr, "Caught Exception during cleanContent(). Displaying content..."
            print >> sys.stderr, "-"*30
            print >> sys.stderr, content
            print >> sys.stderr, "-"*30
            # XXX: Note that you could execute the following commented out line instead of re-raising to ignore this error
            # json_part['content'] = u"ERROR PROCESSING THIS CONTENT"
            # Comment out this line if you uncomment the one above
            raise
        json_msg['parts'].append(json_part)

    # Not every message carries a Date header; keep such a message rather
    # than aborting the whole export with a KeyError.
    if 'Date' not in json_msg:
        print >> sys.stderr, "Message has no Date header; leaving Date unset"
        return json_msg

    # Finally, convert date from asctime to milliseconds since epoch using the
    # $date descriptor so it imports "natively" as an ISODate object in MongoDB
    then = parse(json_msg['Date'])
    if then.utcoffset() is not None:
        # The date carries its own UTC offset: convert through UTC so the
        # result does not depend on the timezone of the machine running this.
        seconds = calendar.timegm(then.utctimetuple())
    else:
        # No offset information: interpret the fields as local time.
        seconds = time.mktime(then.timetuple())
    millis = int(seconds * 1000 + then.microsecond // 1000)
    json_msg['Date'] = {'$date': millis}
    return json_msg
# Write each message out as a JSON object on a separate line
# for easy import into MongoDB via mongoimport. Both files are opened in
# with-blocks so they are closed even when a message fails to convert.
with open(MBOX, 'rb') as mbox_file:
    mbox = mailbox.UnixMailbox(mbox_file, email.message_from_file)
    with open(OUT_FILE, 'w') as f:
        for msg in gen_json_msgs(mbox):
            if msg is not None:
                f.write(json.dumps(msg, cls=Encoder) + '\n')

print("All done")
@chemotkbilo
Copy link

chemotkbilo commented Dec 26, 2016

quick question, how to deal with multipart then?
if part.get_content_maintype() != 'text' and
part.get_content_maintype() !='image':

works fine, but not with multipart, I Want the multipart to also JSONify

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment