# nbhasker/YahooEmailToHTML.py

## YahooEmailToHTML.py
import os
import json
import datetime
import io

# Default configuration used when the file is run as a script.
dir_name = r'C:\Users\Bhasker\Documents\mydocs\src\YahooEmailToHTML\EmailFull'
outfile_name = "output.html"
start_msg_no = 15000       # number of sorted messages to skip before writing
num_msgs_per_file = 5000   # maximum number of messages written to one file

# Reference point for converting "postDate" (seconds since 1970-01-01 UTC).
_EPOCH = datetime.datetime(1970, 1, 1)


def _as_text(value):
    """Return *value* as text, decoding UTF-8 byte strings.

    On Python 2 ``strftime`` and ``%`` formatting yield byte strings, while
    the output file (opened through ``io.open``) only accepts text.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def load_messages(directory):
    """Load every message JSON file found directly in *directory*.

    Files whose name ends in ``.json`` are parsed, except those whose name
    contains ``_raw.json``. Progress, attachment file names and messages with
    an empty or missing ``authorName`` are reported on stdout.

    Returns the parsed objects as a list, in ``os.listdir`` order.
    Raises ``OSError``/``IOError`` if the directory or a file cannot be read
    and ``ValueError`` if a file is not valid JSON.
    """
    msgs = []
    for f_name in os.listdir(directory):
        if not f_name.endswith(".json") or "_raw.json" in f_name:
            continue

        path = os.path.join(directory, f_name)
        print("Loading  %s" % path)

        # Read as UTF-8 explicitly instead of relying on the platform's
        # default encoding.
        with io.open(path, encoding="utf-8") as j:
            d = json.load(j)
        msgs.append(d)

        for a in d.get("attachmentsInfo") or ():
            print("Attachment Filename:  %s" % a["filename"])
        if not d.get("authorName"):
            print("Zero length authorName. Using:  %s" % d.get("from", u""))
    return msgs


def format_message(m):
    """Return the HTML fragment for one message dict *m*.

    The fragment consists of an ``<h1>`` with the author and post time, an
    optional ``<h2>`` subject, the ``messageBody`` value, an optional list of
    attachment file names, and a closing ``<hr>``. The author is
    ``authorName`` when it is non-empty, otherwise ``from``.

    NOTE(review): ``authorName``, ``from``, ``subject``, ``messageBody`` and
    attachment file names are emitted without HTML escaping, as before;
    this presumably relies on the input already being HTML-safe -- confirm.
    """
    posted = _EPOCH + datetime.timedelta(seconds=float(m["postDate"]))
    # The day is taken from ``.day`` rather than strftime's '%#d', because
    # that flag only strips the leading zero on Windows.
    stamp = _as_text("%s %d, %s" % (posted.strftime("%B"),
                                    posted.day,
                                    posted.strftime("%Y %H:%M:%S (UTC)")))

    author = m.get("authorName") or m.get("from", u"")
    parts = [u"<h1>" + author + u" : " + stamp + u"</h1>" + u"\n"]

    if "subject" in m:
        parts.append(u"<h2>" + m["subject"] + u"</h2>" + u"\n")

    if "messageBody" in m:
        parts.append(m["messageBody"])

    attachments = m.get("attachmentsInfo")
    if attachments:
        parts.append(u"<h2>Attachments:</h2>\n")
        parts.append(u"\n" + u"<ul>" + u"\n")
        for a in attachments:
            parts.append(u"<li>" + a["filename"] + u"</li>" + u"\n")
        parts.append(u"</ul>" + u"\n")

    parts.append(u"<hr><br></br>\n")
    return u"".join(parts)


def main(directory=dir_name, out_path=outfile_name,
         first=start_msg_no, limit=num_msgs_per_file):
    """Convert the messages stored in *directory* into one HTML file.

    Messages are sorted by ``(topicId, msgId)``; the first *first* messages
    are skipped and at most *limit* of the following ones are written to
    *out_path* (UTF-8, overwritten if it exists). Negative *first* or
    *limit* are treated as zero.

    Returns the number of messages written.
    """
    msgs = load_messages(directory)
    print("Found  %s  messages" % len(msgs))

    print("Sorting ...")
    ordered = sorted(msgs, key=lambda i: (i["topicId"], i["msgId"]))
    print("Done")

    begin = max(first, 0)
    end = begin + max(limit, 0)

    msg_count = 0
    # The context manager closes the file even if a message fails to render.
    with io.open(out_path, 'w', encoding='utf8') as outf:
        for m in ordered[begin:end]:
            print("Processing  %s  /  %s"
                  % (m.get("authorName", u""), m.get("from", u"")))
            outf.write(format_message(m))
            msg_count += 1

    print("Start Msg Number:  %s Processed Messages:  %s" % (first, msg_count))
    return msg_count


if __name__ == "__main__":
    main()