Last active
August 29, 2015 14:07
-
-
Save pmanvi/14690c14cf3498da59c6 to your computer and use it in GitHub Desktop.
Exporting Enron email to Elasticsearch-friendly JSON documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'pmanvi' | |
import os | |
import sys | |
from email.parser import Parser | |
from elasticsearch import Elasticsearch | |
from datetime import date | |
import traceback | |
# Name of the Elasticsearch index every document is written to.
INDEX_NAME = 'enron-email_without_source'

# Module-wide helpers shared by the functions below: an Elasticsearch client
# created with the library's default settings and an e-mail parser.
es = Elasticsearch()
p = Parser()
def create_index():
    """
    Drop the index named by INDEX_NAME when it is already present, create it
    again and then install the field mapping through putMapping().
    """
    index_already_there = es.indices.exists(INDEX_NAME)
    if index_already_there:
        es.indices.delete(INDEX_NAME)
        notice = "DELETED EXISTING INDEX '{}' to create new one "
        print(notice.format(INDEX_NAME))

    # ignore=400 is passed through to the client unchanged; the response of
    # the create call is echoed for the operator.
    creation_response = es.indices.create(index=INDEX_NAME, ignore=400)
    print(creation_response)

    putMapping()
def get_enron_eml_content(eml_file_to_open):
    """
    Return the complete text of one e-mail file.

    :param eml_file_to_open: path of the file to read.
    :return: the file contents as a single string ('' for an empty file).
    """
    # The context manager closes the handle even when reading fails, and a
    # single read() avoids rebuilding the string once per line.
    with open(eml_file_to_open) as data_file:
        return data_file.read()
def putMapping():
    """
    Install the field mapping for 'enron-type' documents in INDEX_NAME.

    The mapping sets "dynamic" to "strict" and turns "_source" off, so every
    field a document carries has to be declared in "properties" below.
    Each e-mail header is declared under its lower-cased name; the remaining
    fields (sub_holder, file_name, loaded_on, message_body,
    content_size_in_bytes) are the ones index_into_elasticsearch() adds.
    """
    es.indices.put_mapping(
        index=INDEX_NAME,
        doc_type='enron-type',
        body={
            "dynamic": "strict",
            "_source": {"enabled": "false"},
            "properties": {
                "content-transfer-encoding": {"type": "string", "store": "false"},
                "message_body": {"type": "string", "store": "false"},
                "content-type": {"type": "string", "store": "false"},
                # "x-bcc" used to be listed twice with the same definition;
                # it is declared once here.
                "x-bcc": {"type": "string", "store": "false"},
                "from": {"type": "string", "store": "false"},
                "x-from": {"type": "string", "store": "false"},
                "x-filename": {"type": "string", "store": "false"},
                "x-folder": {"type": "string", "store": "false"},
                "to": {"type": "string", "store": "true"},
                "x-to": {"type": "string", "store": "false"},
                "mime-version": {"type": "string", "store": "false"},
                "file_name": {"type": "string", "store": "false"},
                "cc": {"type": "string", "store": "false"},
                "x-cc": {"type": "string", "store": "false"},
                "bcc": {"type": "string", "store": "false"},
                "subject": {"type": "string", "store": "false"},
                "message-id": {"type": "string", "store": "true"},
                "x-origin": {"type": "string", "store": "false"},
                "sub_holder": {"type": "string", "store": "false"},
                "date": {"type": "date", "format": "EEE, dd MMM yyyy HH:mm:ss Z (z)"},
                "attendees": {"type": "string", "store": "false"},
                "loaded_on": {"type": "date", "store": "false", "format": "dateOptionalTime"},
                "content_size_in_bytes": {"type": "long", "store": "true"},
            },
        },
    )
def index_into_elasticsearch(nameOfFileToOpen, sub_folder, filename, contents):
    """
    Parse one e-mail and index it as an 'enron-type' document in INDEX_NAME.

    :param nameOfFileToOpen: path of the e-mail file; only used to read its
        size on disk.
    :param sub_folder: value stored in the document under "sub_holder".
    :param filename: value stored in the document under "file_name".
    :param contents: raw text of the e-mail, headers included.

    A failure while indexing is printed (traceback plus the document) and
    not re-raised.
    """
    msg = p.parsestr(contents)
    jsonMapDoc = {}

    # EML headers such as From, To etc... Going through dict() keeps only the
    # last value of a header that appears more than once.
    for key, value in dict(msg.items()).items():
        key = key.lower()
        if "," in value and key not in ("date", "subject"):
            # Comma-separated headers are indexed as a list of values.
            value = value.split(",")
        elif key == "date" and value == "":
            # NOTE(review): date.today() is not in the
            # "EEE, dd MMM yyyy HH:mm:ss Z (z)" format the mapping declares
            # for "date" - confirm this fallback can actually be indexed.
            value = date.today()
        jsonMapDoc[key] = value

    jsonMapDoc["sub_holder"] = sub_folder
    jsonMapDoc["file_name"] = filename
    jsonMapDoc["loaded_on"] = date.today()
    jsonMapDoc["message_body"] = msg.get_payload()
    jsonMapDoc["content_size_in_bytes"] = os.path.getsize(nameOfFileToOpen)

    try:
        es.index(index=INDEX_NAME, doc_type="enron-type", body=jsonMapDoc)
    except Exception:
        # Report the failure and return normally instead of propagating it.
        traceback.print_exc()
        print("Failed to index the document {}".format(jsonMapDoc))
import datetime | |
def main(argv):
    """
    Recreate the index, then index every file found under the mail directory.

    :param argv: command-line arguments; argv[1] must be the directory that
        holds the Enron mail folders.
    :raises Exception: when the directory argument is missing or does not
        name a directory.
    """
    start = datetime.datetime.now()

    if len(argv) < 2:
        raise Exception("dir name having email data is missing as argument")
    mail_dir = argv[1]
    if not os.path.isdir(mail_dir):
        raise Exception("Invalid dir for Enron emails: %s" % mail_dir)

    create_index()

    # The walk was previously requested with topdown="false", a non-empty
    # string that is truthy, so it already ran top-down; the default is used.
    for root, _dirs, files in os.walk(mail_dir):
        # Folder path relative to mail_dir: the first component is the mail
        # box owner, anything after it is the sub-folder. relpath keeps this
        # correct when mail_dir is given with a trailing separator.
        relative = os.path.relpath(root, mail_dir)
        parts = relative.split(os.sep, 1)
        subFolder = parts[1] if len(parts) == 2 else ''

        for filename in files:
            nameOfFileToOpen = os.path.join(root, filename)
            contents = get_enron_eml_content(nameOfFileToOpen)
            index_into_elasticsearch(nameOfFileToOpen, subFolder, filename, contents)

    elapsed = datetime.datetime.now() - start
    # Substitute the value into the message; total_seconds() converts the
    # timedelta to seconds.
    print('Completed in %d seconds ' % elapsed.total_seconds())


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment