Skip to content

Instantly share code, notes, and snippets.

@kadin2048
Last active July 6, 2016 20:55
Show Gist options
  • Save kadin2048/6d77ab7471590eedcc65 to your computer and use it in GitHub Desktop.
Save kadin2048/6d77ab7471590eedcc65 to your computer and use it in GitHub Desktop.
Convert IBM Sametime log directory to .eml files
#!/usr/bin/env python
# Take an IBM Sametime HTML log file, determine the date, and create
# an RFC-compliant email message from it, for importation into an MUA
#
# Usage: sametimetoeml.py inputfile.html
# Where inputfile.html is a Sametime log located in a dated folder
# (See readme for more useful suggestions.)
#
# Written for Python 2.6
import sys
import os
import dateutil.parser
import time
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
import xml.etree.ElementTree
# Program-wide variables
# Debug toggle: when True, progress messages are written to stdout.
# (No `global` statement is needed here: a name assigned at module scope
# is already a module-level global.)
debug = False
# Class definition
class Chatlog:
    """Chatlog class, for holding chat log and metadata during conversion.

    Attributes are filled in step by step by the set* methods:

    filepath    -- path of the log file, as passed to the constructor
    logtype     -- format of the log; setHTML() only acts on "html"
    html        -- raw file contents (set by setHTML)
    tree        -- ElementTree root element parsed from html (set by setHTML)
    buddyname   -- third-from-last component of filepath (set by setBuddyName)
    contenttype, datetimestr, datetime, isotime, username
                -- values read from <meta> tags by setMetatagdata(); each one
                   is only set when the corresponding tag is present, so
                   callers must be prepared for the attribute to be missing
    """

    def __init__(self, filepath, logtype):
        # Instance
        self.filepath = filepath
        self.logtype = logtype

    def setHTML(self):
        """Read the log file into self.html and parse it into self.tree.

        Does nothing unless logtype is "html".  Errors raised while opening,
        reading or parsing the file propagate to the caller.
        """
        if self.logtype != "html":
            return
        # The with-block closes the file even if read() fails.
        with open(self.filepath, 'r') as infile:
            self.html = infile.read()
        # Then parse it using ElementTree
        self.tree = xml.etree.ElementTree.fromstring(self.html)

    def setBuddyName(self):
        """Set self.buddyname from the grandparent folder of the log file.

        Raises IndexError when filepath has fewer than three components.
        """
        # remote buddy's name should always be the grandparent folder
        self.buddyname = self.filepath.split(os.sep)[-3]

    def setMetatagdata(self):
        """Copy the values of interest out of the document's <meta> tags.

        Requires setHTML() to have populated self.tree.  Tags look like:
        <meta name="sametime:lastActivityTime" content="20070112-131123 (-0500)"/>

        Reads the Content-Type (http-equiv), sametime:creationTime and
        sametime:username tags.  Tags without a content attribute are
        skipped.  Errors from dateutil while parsing the creation time
        propagate to the caller.
        """
        # getiterator() is the only spelling available on Python 2.6, but it
        # is deprecated from 2.7 on and removed in 3.9; prefer iter() when
        # the element provides it.
        walk = getattr(self.tree, 'iter', None) or self.tree.getiterator
        for element in walk("meta"):
            attrs = element.attrib
            content = attrs.get('content')
            if content is None:
                continue  # every tag we care about carries its value here

            if attrs.get('http-equiv') == 'Content-Type':
                self.contenttype = content
                if debug:
                    sys.stdout.write("HTTP Content-Type is: " + self.contenttype + "\n")

            name = attrs.get('name')
            if name == "sametime:creationTime":
                self.datetimestr = content
                if debug:
                    sys.stdout.write("Creation time is " + self.datetimestr + "\n")
                # Fuzzy parsing copes with timestamps that lack seconds,
                # which fixed-position slicing of the string did not.
                self.datetime = dateutil.parser.parse(self.datetimestr, fuzzy=True)
                self.isotime = self.datetime.isoformat()
                if debug:
                    sys.stdout.write("Pretty date is: " + self.datetime.strftime("%a, %d %b %Y %H:%M:%S") + "\n")
            elif name == 'sametime:username':
                self.username = content
                if debug:
                    sys.stdout.write("Sametime username is " + self.username + "\n")
# Processing work
def _debug(message):
    """Write message plus a newline to stdout when the debug toggle is on."""
    if debug:
        sys.stdout.write(message + "\n")


def _charset_from_content_type(contenttype, default="utf-8"):
    """Return the charset parameter of an HTTP Content-Type value.

    e.g. "text/html; charset=UTF-8" -> "UTF-8".  Returns default when the
    value is empty/None or carries no charset parameter.
    """
    if not contenttype:
        return default
    for param in contenttype.split(';')[1:]:
        key, sep, value = param.partition('=')
        if sep and key.strip().lower() == 'charset':
            value = value.strip().strip('"\'')
            if value:
                return value
    return default


def main():
    """Convert the Sametime HTML log named on the command line to an .eml file.

    sys.argv[1] is the input log, which must sit in <buddy>/<date>/ so the
    buddy name can be taken from the path.  Optional sys.argv[2] is the
    output file; by default "<buddy> (<timestamp>).eml" is written to the
    current working directory.

    Returns 0 on success and 1 on a usage error or when the log lacks the
    creation-time or username meta tags.  I/O and parse errors propagate.
    """
    # First argument is the input file
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: sametimetoeml.py inputfile.html [outputfile.eml]\n")
        return 1
    infilename = sys.argv[1]

    # Make sure infilename at least ends in html before processing it.
    # The reason is always reported, so a non-zero exit is never silent.
    if not infilename.endswith("html"):
        sys.stderr.write("Input filename does not end with html\n")
        return 1  # exit with error
    _debug("Input filename: " + infilename)

    # Then get the path
    filepath = os.path.abspath(infilename)  # filepath is a string
    _debug("Input path is: " + str(filepath))
    _debug("Directory path separator char is: " + str(os.sep))

    # instantiate Chatlog object
    _debug("Instantiating chatlog object...")
    chatlog = Chatlog(filepath, "html")

    # read file contents into memory
    _debug("Reading file contents...")
    chatlog.setHTML()

    # get the remote buddy's name from the path
    _debug("Determining buddy name from path...")
    chatlog.setBuddyName()
    _debug("Buddy name is: " + str(chatlog.buddyname))

    # Parse the HTML and set other metadata values
    _debug("Set meta tag values from HTML...")
    chatlog.setMetatagdata()

    # The Date/To headers and the default output name need these two values;
    # report their absence cleanly instead of dying with an AttributeError.
    required = (("datetime", "sametime:creationTime"),
                ("username", "sametime:username"))
    missing = [tag for attr, tag in required
               if getattr(chatlog, attr, None) is None]
    if missing:
        sys.stderr.write("Missing meta tag(s) in " + infilename + ": "
                         + ", ".join(missing) + "\n")
        return 1

    # create message object for the output
    msg_base = MIMEMultipart('mixed')

    # set message headers
    # NOTE: both date headers are written without a UTC offset, exactly as
    # before ("timezones are hard...").
    msg_base['Subject'] = "Sametime with " + chatlog.buddyname
    msg_base['Date'] = chatlog.datetime.strftime("%a, %d %b %Y %H:%M:%S")
    msg_base['From'] = chatlog.buddyname  # TODO: set this to the chat originator
    msg_base['To'] = chatlog.username  # TODO: set this to username unless username == originator, in which case buddyname
    #msg_base['X-Original-Filename'] = infilename
    msg_base['X-Converted-On'] = time.strftime("%a, %d %b %Y %H:%M:%S")
    if debug:
        _debug("-- Headers after parsing first line are...")
        for key, value in msg_base.items():
            _debug("%s: %s" % (key, value))

    # create message content; the charset (probably UTF-8) comes from the
    # HTML content-type meta tag, with UTF-8 assumed when it is absent
    encoding = _charset_from_content_type(getattr(chatlog, 'contenttype', None))
    content = MIMEText(chatlog.html, 'html', encoding)
    msg_base.attach(content)

    # Second arg, if present, is the output file
    if len(sys.argv) > 2:
        outfilename = sys.argv[2]
    else:
        # default output goes to cwd, named after the buddy and chat time
        stamp = chatlog.datetime.strftime("%Y-%m-%dT%H%M")
        outfilename = os.path.join(os.getcwd(),
                                   chatlog.buddyname + ' (' + stamp + ').eml')
    _debug("Output file is: " + outfilename)

    # The with-block flushes and closes the output even if the write fails.
    with open(outfilename, 'w') as fo:
        fo.write(msg_base.as_string())
    return 0
# Script entry point: run the conversion only when executed directly, and
# hand main()'s return value back to the shell as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)

Usage Notes

This is meant to be run against the Sametime logs folder using a shell script.

Sample Script

find /path/to/SametimeLogs -name '*.html' -print0 | xargs -0 -n 1 python sametimetoeml.py

Notes

Be sure to run: find /path/to/SametimeLogs -name '*.html' -print0 | xargs -0 ls first, to see which files you are going to end up operating on.

If files have been stored on a Mac, you may need to remove AppleDouble directories or you'll have a lot of spurious files. One way is by something like find . -name ".AppleDouble" -exec rm -Rf {} \; although there are many other methods.

TODO Items

  • Doesn't write an RFC compliant 'Message-ID' header like it should, which would be good to have for duplicate filtering. Code exists in the Adium conversion script that could be pasted in.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment