Skip to content

Instantly share code, notes, and snippets.

@pmanvi
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pmanvi/14690c14cf3498da59c6 to your computer and use it in GitHub Desktop.
Save pmanvi/14690c14cf3498da59c6 to your computer and use it in GitHub Desktop.
Exporting Enron emails to Elasticsearch-friendly JSON documents
__author__ = 'pmanvi'
import os
import sys
from email.parser import Parser
from elasticsearch import Elasticsearch
from datetime import date
import traceback
# Shared e-mail parser; index_into_elasticsearch() feeds raw message text to
# its parsestr() method.
p = Parser()
# Name of the Elasticsearch index that create_index() rebuilds and that every
# parsed message is written to.
INDEX_NAME = 'enron-email_without_source'
# Elasticsearch client built with no arguments, so the client library's
# default connection settings apply.
es = Elasticsearch()
def create_index():
    """
    Drop the index named by INDEX_NAME if it exists, then create it afresh.

    Once the index has been (re)created, the field mapping is installed by
    calling putMapping(). The response of the create call is printed.
    """
    # Every client call uses keyword arguments: newer releases of the
    # Elasticsearch Python client no longer accept positional API parameters.
    if es.indices.exists(index=INDEX_NAME):
        es.indices.delete(index=INDEX_NAME)
        print('DELETED EXISTING INDEX \'{}\' to create new one '.format(INDEX_NAME))
    # ignore=400 stops an HTTP 400 response to the create request from being
    # raised as an exception, so the mapping step below still runs.
    result = es.indices.create(index=INDEX_NAME, ignore=400)
    print(result)
    putMapping()
def get_enron_eml_content(eml_file_to_open):
    """
    Return the complete text of the file at ``eml_file_to_open``.

    The file is opened in text mode with open()'s default encoding and is
    always closed again, even if reading fails.

    NOTE(review): no explicit encoding is given, so a file containing bytes
    that are invalid in the default encoding will raise while being read;
    confirm whether the corpus needs an explicit ``encoding=``/``errors=``.
    """
    # A single read() replaces the line-by-line "+=" accumulation, which
    # re-copied the growing string on every line.
    with open(eml_file_to_open) as data_file:
        return data_file.read()
def putMapping():
    """
    Install the field mapping for 'enron-type' documents on INDEX_NAME.

    The mapping sets "dynamic" to "strict" and disables "_source". It
    declares one field per e-mail header the loader expects, plus the
    loader's own bookkeeping fields (file_name, sub_holder, loaded_on,
    content_size_in_bytes, message_body).
    """
    mapping = {
        "dynamic": "strict",
        "_source": {"enabled": "false"},
        "properties": {
            "content-transfer-encoding": {"type": "string", "store": "false"},
            "message_body": {"type": "string", "store": "false"},
            "content-type": {"type": "string", "store": "false"},
            # "x-bcc" is declared exactly once; a second identical entry
            # further down the literal was redundant.
            "x-bcc": {"type": "string", "store": "false"},
            "from": {"type": "string", "store": "false"},
            "x-from": {"type": "string", "store": "false"},
            "x-filename": {"type": "string", "store": "false"},
            "x-folder": {"type": "string", "store": "false"},
            "to": {"type": "string", "store": "true"},
            "x-to": {"type": "string", "store": "false"},
            "mime-version": {"type": "string", "store": "false"},
            "file_name": {"type": "string", "store": "false"},
            "cc": {"type": "string", "store": "false"},
            "x-cc": {"type": "string", "store": "false"},
            "bcc": {"type": "string", "store": "false"},
            "subject": {"type": "string", "store": "false"},
            "message-id": {"type": "string", "store": "true"},
            "x-origin": {"type": "string", "store": "false"},
            "sub_holder": {"type": "string", "store": "false"},
            "date": {"type": "date", "format": "EEE, dd MMM yyyy HH:mm:ss Z (z)"},
            "attendees": {"type": "string", "store": "false"},
            "loaded_on": {"type": "date", "store": "false", "format": "dateOptionalTime"},
            "content_size_in_bytes": {"type": "long", "store": "true"},
        },
    }
    es.indices.put_mapping(
        index=INDEX_NAME,
        doc_type='enron-type',
        body=mapping,
    )
def index_into_elasticsearch(nameOfFileToOpen,sub_folder, filename, contents):
    """
    Parse one raw e-mail and index it into INDEX_NAME as an 'enron-type' doc.

    Parameters:
        nameOfFileToOpen: path of the message file on disk; used only to
            record the file size in bytes.
        sub_folder: stored in the document under the "sub_holder" field.
        filename: stored in the document under the "file_name" field.
        contents: full message text, parsed with the module-level parser.

    Every header becomes a field keyed by the lower-cased header name.
    Header values containing a comma are split into a list, except for
    "date" and "subject", which are always kept as a single value.

    Indexing failures are reported (traceback plus the offending document)
    and then swallowed, so one bad message does not abort the caller's loop.
    """
    msg = p.parsestr(contents)
    jsonMapDoc = {}
    # Headers are read through the public Message API. When a header name
    # repeats, the later value overwrites the earlier one.
    for key, value in msg.items():
        key = key.lower()
        if key == "date" and value == "":
            # An empty Date header is replaced with today's date.
            value = date.today()
        elif "," in value and key not in ("date", "subject"):
            value = value.split(",")
        jsonMapDoc[key] = value
    # "sub_holder" is the field name declared in the index mapping.
    jsonMapDoc["sub_holder"] = sub_folder
    jsonMapDoc["file_name"] = filename
    jsonMapDoc["loaded_on"] = date.today()
    jsonMapDoc["message_body"] = msg.get_payload()
    jsonMapDoc["content_size_in_bytes"] = os.path.getsize(nameOfFileToOpen)
    try:
        es.index(index=INDEX_NAME, doc_type="enron-type", body=jsonMapDoc)
    except Exception:
        # Deliberately best-effort: report the failure and carry on.
        traceback.print_exc()
        print("Failed to index the document {}".format(jsonMapDoc))
import datetime
if __name__ == "__main__":
    # Usage: <script> <mail_dir>
    # Rebuilds the index, then indexes every file found under mail_dir.
    start = datetime.datetime.now()
    if len(sys.argv) < 2:
        raise Exception("dir name having email data is missing as argument")
    mail_dir = sys.argv[1]
    if not os.path.isdir(mail_dir):
        raise Exception("Invalid dir for Enron emails: %s" % mail_dir)
    create_index()
    # os.walk's topdown flag is left at its default. A string such as "false"
    # is truthy and would select top-down traversal anyway, and every file is
    # indexed independently, so the visiting order does not matter.
    for root, dirs, files in os.walk(mail_dir):
        # Path of this directory relative to mail_dir. relpath stays correct
        # even when the argument was given with a trailing separator.
        directory = os.path.relpath(root, mail_dir)
        # The first path component is the mailbox owner; whatever follows it
        # (if anything) is the sub-folder recorded with each message.
        parts = directory.split(os.sep, 1)
        subFolder = parts[1] if len(parts) == 2 else ''
        for filename in files:
            nameOfFileToOpen = os.path.join(root, filename)
            contents = get_enron_eml_content(nameOfFileToOpen)
            index_into_elasticsearch(nameOfFileToOpen, subFolder, filename, contents)
    end = datetime.datetime.now()
    # total_seconds() turns the elapsed timedelta into a number that %d can
    # format.
    print('Completed in %d seconds' % (end - start).total_seconds())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment