# pmanvi/enron_email_to_es_exporter.py

## enron_email_to_es_exporter.py
__author__ = 'pmanvi'

import os
import sys
from email.parser import Parser
from elasticsearch import Elasticsearch
from datetime import date
import traceback

# Shared e-mail parser; index_into_elasticsearch() calls p.parsestr() on it.
p = Parser()
# Name of the index that create_index() recreates and documents are written to.
INDEX_NAME = 'enron-email_without_source'
# Client built with no arguments, so it relies on the library's default
# connection settings (presumably a local node -- confirm for your setup).
es = Elasticsearch()

def create_index():
    """
        Drops INDEX_NAME when it is already present, creates it again and
        then installs the field mapping through putMapping().

        The response of the create call is printed.
    """
    index_already_there = es.indices.exists(INDEX_NAME)
    if index_already_there:
        es.indices.delete(INDEX_NAME)
        print("DELETED EXISTING INDEX '{}' to create new one ".format(INDEX_NAME))

    # ignore=400 is handed to the client; presumably it keeps an HTTP 400
    # answer from being raised as an exception -- confirm against the client docs.
    creation_response = es.indices.create(index=INDEX_NAME, ignore=400)
    print(creation_response)

    putMapping()


def get_enron_eml_content(eml_file_to_open):
    """
        Returns the complete text of the e-mail file at eml_file_to_open.

        The file is opened in text mode with the default encoding; errors
        raised by open() or while reading propagate to the caller.
    """
    # The context manager closes the file even when reading fails, and a
    # single read() avoids rebuilding the string once per line.
    with open(eml_file_to_open) as data_file:
        return data_file.read()


def putMapping():
    """
        Installs the field mapping for 'enron-type' documents of INDEX_NAME.

        The mapping body sets "dynamic" to "strict", disables "_source" and
        declares one property per e-mail header / bookkeeping field. Only
        "to", "message-id" and "content_size_in_bytes" are flagged as stored.
    """
    def string_field(stored=False):
        # A fresh dict per call, so no two properties share one object.
        return {"type": "string", "store": "true" if stored else "false"}

    # Same fields, in the same order, as before; the second (identical)
    # "x-bcc" entry of the old literal was a duplicate key and is dropped.
    properties = {
        "content-transfer-encoding": string_field(),
        "message_body": string_field(),
        "content-type": string_field(),
        "x-bcc": string_field(),
        "from": string_field(),
        "x-from": string_field(),
        "x-filename": string_field(),
        "x-folder": string_field(),
        "to": string_field(stored=True),
        "x-to": string_field(),
        "mime-version": string_field(),
        "file_name": string_field(),
        "cc": string_field(),
        "x-cc": string_field(),
        "bcc": string_field(),
        "subject": string_field(),
        "message-id": string_field(stored=True),
        "x-origin": string_field(),
        "sub_holder": string_field(),
        "date": {"type": "date", "format": "EEE, dd MMM yyyy HH:mm:ss Z (z)"},
        "attendees": string_field(),
        "loaded_on": {"type": "date", "store": "false", "format": "dateOptionalTime"},
        "content_size_in_bytes": {"type": "long", "store": "true"},
    }

    es.indices.put_mapping(
        index=INDEX_NAME,
        doc_type='enron-type',
        body={
            "dynamic": "strict",
            "_source": {"enabled": "false"},
            "properties": properties,
        }
    )

def index_into_elasticsearch(nameOfFileToOpen, sub_folder, filename, contents):
    """
        Parses one raw e-mail and indexes it into INDEX_NAME as an
        'enron-type' document.

        nameOfFileToOpen -- path of the e-mail file; only used to read its size
        sub_folder       -- stored under the "sub_holder" key
        filename         -- stored under the "file_name" key
        contents         -- full text of the e-mail (headers and body)

        Header names are lower-cased and used as document keys. Indexing
        failures are printed (traceback plus the document) and swallowed, so
        one bad e-mail does not stop the caller. Returns None.
    """
    msg = p.parsestr(contents)
    jsonMapDoc = {}

    # EML headers such as From, To etc. Going through dict() keeps a single
    # value per header name when a header is repeated. items() is the public
    # accessor for what used to be read from the private msg._headers.
    for key, value in dict(msg.items()).items():
        key = key.lower()
        if key == "date" and value == "":
            # NOTE(review): a date object may not match the custom "date"
            # format declared in putMapping() -- confirm such mails index.
            value = date.today()
        elif "," in value and key not in ("date", "subject"):
            # Comma-separated headers (recipient lists, ...) become lists.
            # NOTE(review): items are not stripped, so they can keep leading
            # whitespace from the original header -- confirm this is wanted.
            value = value.split(",")
        jsonMapDoc[key] = value

    jsonMapDoc["sub_holder"] = sub_folder
    jsonMapDoc["file_name"] = filename
    jsonMapDoc["loaded_on"] = date.today()
    # get_payload() is the public accessor for the former msg._payload.
    jsonMapDoc["message_body"] = msg.get_payload()
    jsonMapDoc["content_size_in_bytes"] = os.path.getsize(nameOfFileToOpen)

    try:
        es.index(index=INDEX_NAME, doc_type="enron-type", body=jsonMapDoc)
    except Exception:
        # Deliberately best-effort: report the failure and carry on.
        traceback.print_exc()
        print("Failed to index the document {}".format(jsonMapDoc))

import datetime
if __name__ == "__main__":
    # Usage: <script> <mail_dir>
    # Recreates the index, then walks mail_dir and indexes every file found.
    start = datetime.datetime.now()
    if len(sys.argv) < 2:
        raise Exception("dir name having email data is missing as argument")
    mail_dir = sys.argv[1]
    if not os.path.isdir(mail_dir):
        raise Exception("Invalid dir for Enron emails: %s" % mail_dir)

    create_index()

    # topdown used to be the string "false", which is truthy; pass the real
    # boolean so the walk is bottom-up as written.
    for root, dirs, files in os.walk(mail_dir, topdown=False):
        # Path of this directory relative to mail_dir. relpath copes with a
        # trailing separator on the argument, unlike slicing by len(mail_dir)+1.
        directory = os.path.relpath(root, mail_dir)
        if directory == os.curdir:
            directory = ''

        # The first component is the mailbox owner (currently not indexed);
        # whatever follows it is the sub-folder.
        parts = directory.split(os.sep, 1)
        subFolder = parts[1] if len(parts) == 2 else ''

        for filename in files:
            nameOfFileToOpen = os.path.join(root, filename)
            contents = get_enron_eml_content(nameOfFileToOpen)
            index_into_elasticsearch(nameOfFileToOpen, subFolder, filename, contents)

    end = datetime.datetime.now()
    # Apply the format (the old call printed the raw template and a timedelta).
    print('Completed in %d seconds ' % (end - start).total_seconds())
	__author__ = 'pmanvi'

	import os
	import sys
	from email.parser import Parser
	from elasticsearch import Elasticsearch
	from datetime import date
	import traceback

	# Shared e-mail parser; index_into_elasticsearch() calls p.parsestr() on it.
	p = Parser()
	# Name of the index that create_index() recreates and documents are written to.
	INDEX_NAME = 'enron-email_without_source'
	# Client built with no arguments, so it relies on the library's default
	# connection settings (presumably a local node -- confirm for your setup).
	es = Elasticsearch()

	def create_index():
	    """
	        Drops INDEX_NAME when it is already present, creates it again and
	        then installs the field mapping through putMapping().

	        The response of the create call is printed.
	    """
	    if es.indices.exists(INDEX_NAME):
	        es.indices.delete(INDEX_NAME)
	        print('DELETED EXISTING INDEX \'{}\' to create new one '.format(INDEX_NAME))
	    # ignore=400 is handed to the client; presumably it keeps an HTTP 400
	    # answer from being raised as an exception -- confirm against the client docs.
	    result = es.indices.create(index=INDEX_NAME, ignore=400)
	    print(result)
	    putMapping()


	def get_enron_eml_content(eml_file_to_open):
	    """
	        Returns the complete text of the e-mail file at eml_file_to_open.

	        The file is opened in text mode with the default encoding; errors
	        raised by open() or while reading propagate to the caller.
	    """
	    # The context manager closes the file even when reading fails, and a
	    # single read() avoids rebuilding the string once per line.
	    with open(eml_file_to_open) as data_file:
	        return data_file.read()


	def putMapping():
	    """
	        Installs the field mapping for 'enron-type' documents of INDEX_NAME.

	        The mapping body sets "dynamic" to "strict", disables "_source" and
	        declares one property per e-mail header / bookkeeping field. Only
	        "to", "message-id" and "content_size_in_bytes" are flagged as stored.
	    """
	    def string_field(stored=False):
	        # A fresh dict per call, so no two properties share one object.
	        return {"type": "string", "store": "true" if stored else "false"}

	    # Same fields, in the same order, as before; the second (identical)
	    # "x-bcc" entry of the old literal was a duplicate key and is dropped.
	    properties = {
	        "content-transfer-encoding": string_field(),
	        "message_body": string_field(),
	        "content-type": string_field(),
	        "x-bcc": string_field(),
	        "from": string_field(),
	        "x-from": string_field(),
	        "x-filename": string_field(),
	        "x-folder": string_field(),
	        "to": string_field(stored=True),
	        "x-to": string_field(),
	        "mime-version": string_field(),
	        "file_name": string_field(),
	        "cc": string_field(),
	        "x-cc": string_field(),
	        "bcc": string_field(),
	        "subject": string_field(),
	        "message-id": string_field(stored=True),
	        "x-origin": string_field(),
	        "sub_holder": string_field(),
	        "date": {"type": "date", "format": "EEE, dd MMM yyyy HH:mm:ss Z (z)"},
	        "attendees": string_field(),
	        "loaded_on": {"type": "date", "store": "false", "format": "dateOptionalTime"},
	        "content_size_in_bytes": {"type": "long", "store": "true"},
	    }

	    es.indices.put_mapping(
	        index=INDEX_NAME,
	        doc_type='enron-type',
	        body={
	            "dynamic": "strict",
	            "_source": {"enabled": "false"},
	            "properties": properties,
	        }
	    )

	def index_into_elasticsearch(nameOfFileToOpen, sub_folder, filename, contents):
	    """
	        Parses one raw e-mail and indexes it into INDEX_NAME as an
	        'enron-type' document.

	        nameOfFileToOpen -- path of the e-mail file; only used to read its size
	        sub_folder       -- stored under the "sub_holder" key
	        filename         -- stored under the "file_name" key
	        contents         -- full text of the e-mail (headers and body)

	        Header names are lower-cased and used as document keys. Indexing
	        failures are printed (traceback plus the document) and swallowed, so
	        one bad e-mail does not stop the caller. Returns None.
	    """
	    msg = p.parsestr(contents)
	    jsonMapDoc = {}

	    # EML headers such as From, To etc. Going through dict() keeps a single
	    # value per header name when a header is repeated. items() is the public
	    # accessor for what used to be read from the private msg._headers.
	    for key, value in dict(msg.items()).items():
	        key = key.lower()
	        if key == "date" and value == "":
	            # NOTE(review): a date object may not match the custom "date"
	            # format declared in putMapping() -- confirm such mails index.
	            value = date.today()
	        elif "," in value and key not in ("date", "subject"):
	            # Comma-separated headers (recipient lists, ...) become lists.
	            value = value.split(",")
	        jsonMapDoc[key] = value

	    jsonMapDoc["sub_holder"] = sub_folder
	    jsonMapDoc["file_name"] = filename
	    jsonMapDoc["loaded_on"] = date.today()
	    # get_payload() is the public accessor for the former msg._payload.
	    jsonMapDoc["message_body"] = msg.get_payload()
	    jsonMapDoc["content_size_in_bytes"] = os.path.getsize(nameOfFileToOpen)

	    try:
	        es.index(index=INDEX_NAME, doc_type="enron-type", body=jsonMapDoc)
	    except Exception:
	        # Deliberately best-effort: report the failure and carry on.
	        traceback.print_exc()
	        print("Failed to index the document {}".format(jsonMapDoc))

	import datetime
	if __name__ == "__main__":
	    # Usage: <script> <mail_dir>
	    # Recreates the index, then walks mail_dir and indexes every file found.
	    start = datetime.datetime.now()
	    if len(sys.argv) < 2:
	        raise Exception("dir name having email data is missing as argument")
	    mail_dir = sys.argv[1]
	    if not os.path.isdir(mail_dir):
	        raise Exception("Invalid dir for Enron emails: %s" % mail_dir)

	    create_index()

	    # topdown used to be the string "false", which is truthy; pass the real
	    # boolean so the walk is bottom-up as written.
	    for root, dirs, files in os.walk(mail_dir, topdown=False):
	        # Path of this directory relative to mail_dir. relpath copes with a
	        # trailing separator on the argument, unlike slicing by len(mail_dir)+1.
	        directory = os.path.relpath(root, mail_dir)
	        if directory == os.curdir:
	            directory = ''

	        # The first component is the mailbox owner (currently not indexed);
	        # whatever follows it is the sub-folder.
	        parts = directory.split(os.sep, 1)
	        subFolder = parts[1] if len(parts) == 2 else ''

	        for filename in files:
	            nameOfFileToOpen = os.path.join(root, filename)
	            contents = get_enron_eml_content(nameOfFileToOpen)
	            index_into_elasticsearch(nameOfFileToOpen, subFolder, filename, contents)

	    end = datetime.datetime.now()
	    # Apply the format (the old call printed the raw template and a timedelta).
	    print('Completed in %d seconds ' % (end - start).total_seconds())