kynan/irssi_log_merge.py

## irssi_log_merge.py
#!/usr/bin/python
# .- coding: utf-8 -.
#
# irssi_log_merge.py.
# Written by Lasse Karstensen <lasse.karstensen@gmail.com>, 2008.
# Released under GPLv2.
#
# Newest version available on http://hyse.org/irssi-log-merge/ .
#
# Modified by Florian Rathgeber <florian.rathgeber@gmail.com>
# Latest version: https://gist.github.com/2706187

import os, sys, glob, shutil, codecs

def usage():
    print "irssi_log_merge.py by <lasse.karstensen@gmail.com>"
    print "Usage: %s [opts] todir dir1 dir2 <dir3...>" % sys.argv[0]
    print ""
    print "Sorts irssi log files chronologically from dir[123..] into todir."
    print "Directories should contain subdirectories like 'EFNet','IRCNet'..."
    print "Use -f to overwrite existing files in todir, if file size differ."
    print "Use -F to overwrite existing files in todir no matter what."
    print ""

class IrssiLogReader():
    import time, codecs
    def __init__(self):
        self.linebuf = ''
        self.sessionbuf = ''
        self.files = {}

    def addfile(self, filename):
        self.files [ filename ] = {
                                'fp': self.codecs.open(filename, 'r+', 'utf-8'),
                                'time': None,
                                'closed': False,
                                'buffer': u'' }

    def run(self, output):
        if len(self.files) == 0:
            raise Exception, 'No files to parse'

        pref = None
        while True:
            # find the earliest file
            if not pref:
                t1 = None
                for filename, fdict in self.files.items():
                    # if the file is empty, don't bother trying to read from
                    # it.
                    if fdict["closed"]:
                        #print "file %s is closed, and not eligiable for election" % filename
                        continue

                    l = fdict["fp"].readline()

                    fdict["buffer"] += l
                    if l.startswith('--- Log opened'):
                        datestring = " ".join(l.split()[3:])
                        try:
                            dt = self.parse_timestamp( datestring )
                        except ValueError, e:
                            # So. This is usually where the badness
                            # occurs. So. We handle it with grace, and
                            # dump the problematic log lines into a new
                            # file that can be read later.
                            #
                            # Easter egg!
                            print "ERROR: ValueError when parsing timestamp."
                            if "dump" in filename:
                                print "Seems to already be reading a dump file, not replacing dump"
                            else:
                                dumpfile = "fault/EFNet/dump"
                                import os.path
                                if not os.path.isdir(os.path.dirname(dumpfile)):
                                    print "Directory %s does not exist. Create it to get a dump of exception cause." % ( os.path.dirname(dumpfile))
                                    raise ValueError, e
                                ff = self.codecs.open(dumpfile, "w+", 'utf-8')
                                ff.write( l )
                                # cheat, to make it parseable
                                ff.write("--- Log closed\n")
                                ff.close()
                                print "Troublesome log dumped to file %s" % dumpfile
                            raise ValueError, e
                        fdict["time"] = dt
                        #print "file %s has datestring %s" % (filename, dt)

                # see if we're done.
                active = []
                for filename, fdict in self.files.items():
                    if not fdict["closed"]:
                        active.append(filename)
                if len(active) == 0:
                    #print "No more data to read"
                    return
                # loop through all files and find the one with the earliest
                # timestamp.
                for filename, fdict in self.files.items():
                    if fdict["closed"]:
                        continue

                    #print "File: %s\ttimestamp: %s" % (filename, fdict["time"])
                    if t1 == None:
                        t1 = fdict["time"]

                    if fdict["time"] <= t1:
                        t1 = fdict["time"]
                        pref = filename
                #print "Finished electing. file %s. ts=%s, pref: %s" % (filename, t1,  pref)

            #print "dumping block from file %s (ts: %s)" % (pref, self.files[pref]["time"])
            while True:
                # we may have cached data for this file, read while
                # seeking for the next Log started line. Dump this
                # first.
                if len(self.files[pref]["buffer"]) > 0:
                    l = self.files[pref]["buffer"]
                    self.files[pref]["buffer"] = u''
                else:
                    l = self.files[pref]["fp"].readline()
                    #print type(l), dump(l)
                    if len(l) == 0:
                        #print "end of file %s, forcing new election" % filename
                        self.files[pref]["closed"] = True
                        pref = None
                        break

                try:
                    output.write( l )
                except UnicodeEncodeError, e:
                    print type(l), dump(l)
                    print e
                    raise Exception

                if l.startswith('--- Log closed'):
                    #print "end of block, forcing new election"
                    pref = None
                    break

    def parse_timestamp(self, timestring):
        # from http://www.python.org/doc/2.5.2/lib/node745.html :
        # .. If, when coding a module for general use, you need a locale
        # independent version of an operation that is affected by the
        # locale (such as string.lower(), or certain formats used with
        # time.strftime()), you will have to find a way to do it without
        # using the standard library routine. Even better is convincing
        # yourself that using locale settings is okay. Only as a last
        # resort should you document that your module is not compatible
        # with non-"C" locale settings.
        #
        # So, here we go. :(
        # --- Log opened Tue Mar 28 21:17:38 2006
        # datetime(year, month, day[, hour[, minute[, second[, microsecond[,tzinfo]]]]])
        # en: Sun Oct 15 23:58:13 2006
        # >>> time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())
        # 'Mon Nov 17 17:47:16 2008'
        # 'tor mar 29 13:16:47 2007' - norwegian day names used. replace
        # enough so that strptime can recognize it as english.
        #
        # TODO: We have the month, the year and the date. We don't need
        # to bother transforming and parsing the day name, as it is a
        # function of the mentioned three. Remove it some time.
        tdata = """
# notation hell. format: tovalue = from1,from2..
mon = man, ma.
tue = tir, ti.
wed = ons, on.
thu = tor, to.
fri = fre, fr.
# latin-1
#sat = lør
#sun = søn
#sat = l\xf8r, l\xf8.
#sun = s\xf8n, s\xf8.
sat = lør, lø.
sun = søn, sø.
#, s\xc3\xb8n
#
apr = april
may = mai
jun = juni, jun.
jul = juli, jul.
aug = aug.
sep = sep.
oct = oct., okt, okt.
nov = nov.
dec = des"""
        format  = '%a %b %d %H:%M:%S %Y'
        transforms = {}
        for t in tdata.split("\n"):
            if len(t) == 0:
                continue
            if t[0] == "#":
                continue

            (tovalue, keys) = t.split("=", 1)
            if not "," in keys:
                keys = [ keys ]
            else:
                keys = keys.split(",")

            for key in keys:
                key = unicode(key, 'utf-8')
                transforms[ key.strip() + " " ] = tovalue.strip() + " "
        s = None

        for fromkey, tovalue in transforms.items():
            #print fromkey, tovalue
            i = timestring.find(fromkey)
            if i > -1:
                #print "performing transform %s->%s" % (fromkey, tovalue)
                timestring = timestring.replace(fromkey, tovalue)

        s = self.time.strptime(timestring, format)
        return s

# following three procedures are stolen from
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/142812 Jack
# Trainor 2008
""" dump any string to formatted hex output """
def dump(s):
    import types
    if type(s) == types.StringType:
        return dumpString(s)
    elif type(s) == types.UnicodeType:
        return dumpUnicodeString(s)

FILTER = ''.join([(len(repr(chr(x))) == 3) and chr(x) or '.' for x in range(256)])

""" dump any string, ascii or encoded, to formatted hex output """
def dumpString(src, length=16):
    result = []
    for i in xrange(0, len(src), length):
       chars = src[i:i+length]
       hex = ' '.join(["%02x" % ord(x) for x in chars])
       printable = ''.join(["%s" % ((ord(x) <= 127 and FILTER[ord(x)]) or '.') for x in chars])
       result.append("%04x  %-*s  %s\n" % (i, length*3, hex, printable))
    return ''.join(result)

""" dump unicode string to formatted hex output """
def dumpUnicodeString(src, length=8):
    result = []
    for i in xrange(0, len(src), length):
       unichars = src[i:i+length]
       hex = ' '.join(["%04x" % ord(x) for x in unichars])
       printable = ''.join(["%s" % ((ord(x) <= 127 and FILTER[ord(x)]) or '.') for x in unichars])
       result.append("%04x  %-*s  %s\n" % (i*2, length*5, hex, printable))
    return ''.join(result)

def find_networks(paths):
    known_networks = {}
    for dir in paths:
        for ent in glob.glob(dir + "/*"):
            if not os.path.isdir(ent):
                continue
            networkname = os.path.basename(ent)
            if not known_networks.has_key(networkname):
                known_networks[ networkname ] = []
            known_networks[ networkname ].append( ent )
    return known_networks

def main():
    if len(sys.argv) < 3:
        usage()
        sys.exit(255)

    # not very nice, but using getopt is a bit overkill.
    force = False
    forcefull = False
    if "-f" in sys.argv:
        force = True
        print "Replacing files in todir if size is incorrect"
        sys.argv.pop( sys.argv.index("-f") )

    if "-F" in sys.argv:
        forcefull = True
        force = True
        print "Overwriting old files in todir unconditionally"
        sys.argv.pop( sys.argv.index("-F") )

    todir = sys.argv[1]
    dirs = sys.argv[2:]

    if not os.path.isdir(todir):
        os.makedirs(todir)

    known_networks = find_networks(dirs)
    sources = {}

    for network, paths in known_networks.items():
        # TODO: irssi sometimes create a second network instance,
        # after wrong use of /connect and such. Perhaps one day one
        # should try to merge these with the original.
        #if network.endswith("2"):
        #    continue

        #print "finding available sources in network %s\t" % network,
        for path in paths:
            for file in glob.glob(path + '/*'):
                if not os.path.isfile(file):
                    continue
                (network, id) = file.split(os.sep)[-2:]
                fullid = network + "/" + id
                if not sources.has_key(fullid):
                    sources[fullid] = []
                sources[fullid] += [ file ]

    for source, files in sources.items():
        #print "source: %s" % source
        tofile = os.path.join(todir, source)
        if not os.path.isdir(os.path.dirname(tofile)):
            os.makedirs(os.path.dirname(tofile))

        if os.path.isfile(tofile) and not forcefull:
            srcsize = 0
            for srcfile in files:
                srcsize += os.stat(srcfile)[5]
            if os.stat(tofile)[5] == srcsize:
                print "Existing file %s has size equal to size of sources. Assuming file is already sorted. (-F to override)" % tofile
                continue

            if not force:
                print "File %s already exist. Not overwriting. Do you need -f?" % tofile
                continue

        if len(files) == 1:
            print "File %s only exist in one source, copying.." % source
            shutil.copy(files[0], tofile)
            continue

        print "Merging source %s with files: %s" % (source, " ".join(files))

        fp = codecs.open(tofile, 'w+', 'utf-8')

        lr = IrssiLogReader()
        for file in files:
            lr.addfile(file)
        lr.run(output=fp)
        fp.close()
    print "All files successfully merged, normal exit"

if __name__ == "__main__":
    main()
	#!/usr/bin/python
	# .- coding: utf-8 -.
	#
	# irssi_log_merge.py.
	# Written by Lasse Karstensen <lasse.karstensen@gmail.com>, 2008.
	# Released under GPLv2.
	#
	# Newest version available on http://hyse.org/irssi-log-merge/ .
	#
	# Modified by Florian Rathgeber <florian.rathgeber@gmail.com>
	# Latest version: https://gist.github.com/2706187

	import os, sys, glob, shutil, codecs

	def usage():
	print "irssi_log_merge.py by <lasse.karstensen@gmail.com>"
	print "Usage: %s [opts] todir dir1 dir2 <dir3...>" % sys.argv[0]
	print ""
	print "Sorts irssi log files chronologically from dir[123..] into todir."
	print "Directories should contain subdirectories like 'EFNet','IRCNet'..."
	print "Use -f to overwrite existing files in todir, if file size differ."
	print "Use -F to overwrite existing files in todir no matter what."
	print ""

	class IrssiLogReader():
	import time, codecs
	def __init__(self):
	self.linebuf = ''
	self.sessionbuf = ''
	self.files = {}

	def addfile(self, filename):
	self.files [ filename ] = {
	'fp': self.codecs.open(filename, 'r+', 'utf-8'),
	'time': None,
	'closed': False,
	'buffer': u'' }

	def run(self, output):
	if len(self.files) == 0:
	raise Exception, 'No files to parse'

	pref = None
	while True:
	# find the earliest file
	if not pref:
	t1 = None
	for filename, fdict in self.files.items():
	# if the file is empty, don't bother trying to read from
	# it.
	if fdict["closed"]:
	#print "file %s is closed, and not eligiable for election" % filename
	continue

	l = fdict["fp"].readline()

	fdict["buffer"] += l
	if l.startswith('--- Log opened'):
	datestring = " ".join(l.split()[3:])
	try:
	dt = self.parse_timestamp( datestring )
	except ValueError, e:
	# So. This is usually where the badness
	# occurs. So. We handle it with grace, and
	# dump the problematic log lines into a new
	# file that can be read later.
	#
	# Easter egg!
	print "ERROR: ValueError when parsing timestamp."
	if "dump" in filename:
	print "Seems to already be reading a dump file, not replacing dump"
	else:
	dumpfile = "fault/EFNet/dump"
	import os.path
	if not os.path.isdir(os.path.dirname(dumpfile)):
	print "Directory %s does not exist. Create it to get a dump of exception cause." % ( os.path.dirname(dumpfile))
	raise ValueError, e
	ff = self.codecs.open(dumpfile, "w+", 'utf-8')
	ff.write( l )
	# cheat, to make it parseable
	ff.write("--- Log closed\n")
	ff.close()
	print "Troublesome log dumped to file %s" % dumpfile
	raise ValueError, e
	fdict["time"] = dt
	#print "file %s has datestring %s" % (filename, dt)

	# see if we're done.
	active = []
	for filename, fdict in self.files.items():
	if not fdict["closed"]:
	active.append(filename)
	if len(active) == 0:
	#print "No more data to read"
	return
	# loop through all files and find the one with the earliest
	# timestamp.
	for filename, fdict in self.files.items():
	if fdict["closed"]:
	continue

	#print "File: %s\ttimestamp: %s" % (filename, fdict["time"])
	if t1 == None:
	t1 = fdict["time"]

	if fdict["time"] <= t1:
	t1 = fdict["time"]
	pref = filename
	#print "Finished electing. file %s. ts=%s, pref: %s" % (filename, t1, pref)

	#print "dumping block from file %s (ts: %s)" % (pref, self.files[pref]["time"])
	while True:
	# we may have cached data for this file, read while
	# seeking for the next Log started line. Dump this
	# first.
	if len(self.files[pref]["buffer"]) > 0:
	l = self.files[pref]["buffer"]
	self.files[pref]["buffer"] = u''
	else:
	l = self.files[pref]["fp"].readline()
	#print type(l), dump(l)
	if len(l) == 0:
	#print "end of file %s, forcing new election" % filename
	self.files[pref]["closed"] = True
	pref = None
	break

	try:
	output.write( l )
	except UnicodeEncodeError, e:
	print type(l), dump(l)
	print e
	raise Exception

	if l.startswith('--- Log closed'):
	#print "end of block, forcing new election"
	pref = None
	break

	def parse_timestamp(self, timestring):
	# from http://www.python.org/doc/2.5.2/lib/node745.html :
	# .. If, when coding a module for general use, you need a locale
	# independent version of an operation that is affected by the
	# locale (such as string.lower(), or certain formats used with
	# time.strftime()), you will have to find a way to do it without
	# using the standard library routine. Even better is convincing
	# yourself that using locale settings is okay. Only as a last
	# resort should you document that your module is not compatible
	# with non-"C" locale settings.
	#
	# So, here we go. :(
	# --- Log opened Tue Mar 28 21:17:38 2006
	# datetime(year, month, day[, hour[, minute[, second[, microsecond[,tzinfo]]]]])
	# en: Sun Oct 15 23:58:13 2006
	# >>> time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())
	# 'Mon Nov 17 17:47:16 2008'
	# 'tor mar 29 13:16:47 2007' - norwegian day names used. replace
	# enough so that strptime can recognize it as english.
	#
	# TODO: We have the month, the year and the date. We don't need
	# to bother transforming and parsing the day name, as it is a
	# function of the mentioned three. Remove it some time.
	tdata = """
	# notation hell. format: tovalue = from1,from2..
	mon = man, ma.
	tue = tir, ti.
	wed = ons, on.
	thu = tor, to.
	fri = fre, fr.
	# latin-1
	#sat = lør
	#sun = søn
	#sat = l\xf8r, l\xf8.
	#sun = s\xf8n, s\xf8.
	sat = lør, lø.
	sun = søn, sø.
	#, s\xc3\xb8n
	#
	apr = april
	may = mai
	jun = juni, jun.
	jul = juli, jul.
	aug = aug.
	sep = sep.
	oct = oct., okt, okt.
	nov = nov.
	dec = des"""
	format = '%a %b %d %H:%M:%S %Y'
	transforms = {}
	for t in tdata.split("\n"):
	if len(t) == 0:
	continue
	if t[0] == "#":
	continue

	(tovalue, keys) = t.split("=", 1)
	if not "," in keys:
	keys = [ keys ]
	else:
	keys = keys.split(",")

	for key in keys:
	key = unicode(key, 'utf-8')
	transforms[ key.strip() + " " ] = tovalue.strip() + " "
	s = None

	for fromkey, tovalue in transforms.items():
	#print fromkey, tovalue
	i = timestring.find(fromkey)
	if i > -1:
	#print "performing transform %s->%s" % (fromkey, tovalue)
	timestring = timestring.replace(fromkey, tovalue)

	s = self.time.strptime(timestring, format)
	return s

	# following three procedures are stolen from
	# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/142812 Jack
	# Trainor 2008
	""" dump any string to formatted hex output """
	def dump(s):
	import types
	if type(s) == types.StringType:
	return dumpString(s)
	elif type(s) == types.UnicodeType:
	return dumpUnicodeString(s)

	FILTER = ''.join([(len(repr(chr(x))) == 3) and chr(x) or '.' for x in range(256)])

	""" dump any string, ascii or encoded, to formatted hex output """
	def dumpString(src, length=16):
	result = []
	for i in xrange(0, len(src), length):
	chars = src[i:i+length]
	hex = ' '.join(["%02x" % ord(x) for x in chars])
	printable = ''.join(["%s" % ((ord(x) <= 127 and FILTER[ord(x)]) or '.') for x in chars])
	result.append("%04x %-s %s\n" % (i, length3, hex, printable))
	return ''.join(result)

	""" dump unicode string to formatted hex output """
	def dumpUnicodeString(src, length=8):
	result = []
	for i in xrange(0, len(src), length):
	unichars = src[i:i+length]
	hex = ' '.join(["%04x" % ord(x) for x in unichars])
	printable = ''.join(["%s" % ((ord(x) <= 127 and FILTER[ord(x)]) or '.') for x in unichars])
	result.append("%04x %-s %s\n" % (i2, length*5, hex, printable))
	return ''.join(result)

	def find_networks(paths):
	known_networks = {}
	for dir in paths:
	for ent in glob.glob(dir + "/*"):
	if not os.path.isdir(ent):
	continue
	networkname = os.path.basename(ent)
	if not known_networks.has_key(networkname):
	known_networks[ networkname ] = []
	known_networks[ networkname ].append( ent )
	return known_networks

	def main():
	if len(sys.argv) < 3:
	usage()
	sys.exit(255)

	# not very nice, but using getopt is a bit overkill.
	force = False
	forcefull = False
	if "-f" in sys.argv:
	force = True
	print "Replacing files in todir if size is incorrect"
	sys.argv.pop( sys.argv.index("-f") )

	if "-F" in sys.argv:
	forcefull = True
	force = True
	print "Overwriting old files in todir unconditionally"
	sys.argv.pop( sys.argv.index("-F") )

	todir = sys.argv[1]
	dirs = sys.argv[2:]

	if not os.path.isdir(todir):
	os.makedirs(todir)

	known_networks = find_networks(dirs)
	sources = {}

	for network, paths in known_networks.items():
	# TODO: irssi sometimes create a second network instance,
	# after wrong use of /connect and such. Perhaps one day one
	# should try to merge these with the original.
	#if network.endswith("2"):
	# continue

	#print "finding available sources in network %s\t" % network,
	for path in paths:
	for file in glob.glob(path + '/*'):
	if not os.path.isfile(file):
	continue
	(network, id) = file.split(os.sep)[-2:]
	fullid = network + "/" + id
	if not sources.has_key(fullid):
	sources[fullid] = []
	sources[fullid] += [ file ]

	for source, files in sources.items():
	#print "source: %s" % source
	tofile = os.path.join(todir, source)
	if not os.path.isdir(os.path.dirname(tofile)):
	os.makedirs(os.path.dirname(tofile))

	if os.path.isfile(tofile) and not forcefull:
	srcsize = 0
	for srcfile in files:
	srcsize += os.stat(srcfile)[5]
	if os.stat(tofile)[5] == srcsize:
	print "Existing file %s has size equal to size of sources. Assuming file is already sorted. (-F to override)" % tofile
	continue

	if not force:
	print "File %s already exist. Not overwriting. Do you need -f?" % tofile
	continue

	if len(files) == 1:
	print "File %s only exist in one source, copying.." % source
	shutil.copy(files[0], tofile)
	continue

	print "Merging source %s with files: %s" % (source, " ".join(files))

	fp = codecs.open(tofile, 'w+', 'utf-8')

	lr = IrssiLogReader()
	for file in files:
	lr.addfile(file)
	lr.run(output=fp)
	fp.close()
	print "All files successfully merged, normal exit"

	if __name__ == "__main__":
	main()