Created
May 16, 2012 00:13
-
-
Save kynan/2706187 to your computer and use it in GitHub Desktop.
Irssi Log Merger is a small Python script that takes a set of Irssi irclogs/ directories and merges them chronologically into a single set of files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# irssi_log_merge.py.
# Written by Lasse Karstensen <lasse.karstensen@gmail.com>, 2008.
# Released under GPLv2.
#
# Newest version available on http://hyse.org/irssi-log-merge/ .
#
# Modified by Florian Rathgeber <florian.rathgeber@gmail.com>
# Latest version: https://gist.github.com/2706187
import codecs
import glob
import os
import shutil
import sys
def usage():
    """Print the command-line help text to standard output.

    The program name shown in the synopsis line is taken from
    ``sys.argv[0]``.
    """
    # Single-argument print(...) calls are valid both as Python 2 print
    # statements and as Python 3 function calls, and emit the same text.
    print("irssi_log_merge.py by <lasse.karstensen@gmail.com>")
    print("Usage: %s [opts] todir dir1 dir2 <dir3...>" % sys.argv[0])
    print("")
    print("Sorts irssi log files chronologically from dir[123..] into todir.")
    print("Directories should contain subdirectories like 'EFNet','IRCNet'...")
    print("Use -f to overwrite existing files in todir, if file size differ.")
    print("Use -F to overwrite existing files in todir no matter what.")
    print("")
class IrssiLogReader():
    """Merge several Irssi log files, one session at a time, oldest first.

    A session is the run of lines from a ``--- Log opened <date>`` header up
    to and including the next ``--- Log closed`` line.  Files registered
    with addfile() are read side by side; run() repeatedly elects the file
    whose pending session was opened earliest and copies that one session to
    the output.
    """

    # Imported in the class body on purpose: the modules become class
    # attributes and are reached as self.time / self.codecs.
    import time
    import codecs

    # Markers that delimit a session in the log files.
    _SESSION_START = '--- Log opened'
    _SESSION_END = '--- Log closed'

    # Layout of the date in a header line, e.g.
    #   --- Log opened Tue Mar 28 21:17:38 2006
    _TIME_FORMAT = '%a %b %d %H:%M:%S %Y'

    # Where a header line whose date cannot be parsed is saved for later
    # inspection.  The directory has to be created by the user.
    _DUMPFILE = "fault/EFNet/dump"

    # Norwegian day and month names (full and abbreviated) as found in logs
    # written under a Norwegian locale, mapped to the English abbreviation
    # strptime() is expected to recognise.  Built once instead of being
    # re-parsed from a text table on every call.  \xf8 is o-with-stroke; it
    # is written as an escape so the table does not depend on the encoding
    # of this source file.
    _ALIASES = {
        # day names
        u'man': 'mon', u'ma.': 'mon',
        u'tir': 'tue', u'ti.': 'tue',
        u'ons': 'wed', u'on.': 'wed',
        u'tor': 'thu', u'to.': 'thu',
        u'fre': 'fri', u'fr.': 'fri',
        u'l\xf8r': 'sat', u'l\xf8.': 'sat',
        u's\xf8n': 'sun', u's\xf8.': 'sun',
        # month names
        u'april': 'apr',
        u'mai': 'may',
        u'juni': 'jun', u'jun.': 'jun',
        u'juli': 'jul', u'jul.': 'jul',
        u'aug.': 'aug',
        u'sep.': 'sep',
        u'oct.': 'oct', u'okt': 'oct', u'okt.': 'oct',
        u'nov.': 'nov',
        u'des': 'dec',
    }

    def __init__(self):
        self.linebuf = ''
        self.sessionbuf = ''
        # filename -> state dictionary, see addfile()
        self.files = {}

    def addfile(self, filename):
        """Open `filename` as UTF-8 text and register it as a merge input.

        The per-file state holds the open file object ('fp'), the parsed
        timestamp of the pending session header ('time'), whether the file
        is exhausted ('closed') and the line that was read ahead but not yet
        written ('buffer').
        """
        self.files[filename] = {
            # Read-only: the inputs are never written to, and 'r+' would
            # refuse to open logs the user may only read.
            'fp': self.codecs.open(filename, 'r', 'utf-8'),
            'time': None,
            'closed': False,
            'buffer': u'',
        }

    def run(self, output):
        """Write all sessions of all registered files to `output`.

        `output` is any object with a write() method accepting text.

        Raises Exception if no file was added, and ValueError if a session
        header carries a date that cannot be parsed.
        """
        if not self.files:
            raise Exception('No files to parse')

        while True:
            self._read_pending_headers()
            elected = self._elect_earliest()
            if elected is None:
                # No more data to read.
                return
            self._copy_session(elected, output)

    def _read_pending_headers(self):
        """Read one line ahead in every open file that has nothing buffered."""
        for filename, fdict in self.files.items():
            # A file that was not elected in an earlier round still holds
            # its header in the buffer.  Reading on would swallow session
            # lines and eventually replace its timestamp with that of a
            # later session, so such files are left alone.
            if fdict["closed"] or fdict["buffer"]:
                continue

            line = fdict["fp"].readline()
            if not line:
                self._close(fdict)
                continue

            fdict["buffer"] = line
            if not line.startswith(self._SESSION_START):
                # Not a header: keep the previous timestamp, so the line is
                # merged where the session before it was.
                continue

            datestring = " ".join(line.split()[3:])
            try:
                fdict["time"] = self.parse_timestamp(datestring)
            except ValueError:
                # This is usually where the badness occurs.  Save the
                # offending line for inspection, then give up.
                self._report_bad_timestamp(filename, line)
                raise

    def _elect_earliest(self):
        """Return the open file with the earliest pending session, or None.

        A file without any timestamp yet sorts before all dated files.  On
        equal timestamps the file iterated last wins.
        """
        elected = None
        elected_time = None
        for filename, fdict in self.files.items():
            if fdict["closed"]:
                continue
            candidate = fdict["time"]
            if (elected is None
                    or candidate is None
                    or (elected_time is not None
                        and candidate <= elected_time)):
                elected = filename
                elected_time = candidate
        return elected

    def _copy_session(self, filename, output):
        """Copy lines of `filename` to `output` through the next close line."""
        fdict = self.files[filename]
        while True:
            # The line read ahead while looking for the header goes first.
            if fdict["buffer"]:
                line = fdict["buffer"]
                fdict["buffer"] = u''
            else:
                line = fdict["fp"].readline()

            if not line:
                # End of file in the middle of a session.
                self._close(fdict)
                return

            try:
                output.write(line)
            except UnicodeEncodeError as e:
                print("%s %s" % (type(line), dump(line)))
                print(e)
                # Re-raise the original error instead of a bare Exception
                # so the cause is not lost.
                raise

            if line.startswith(self._SESSION_END):
                return

    def _close(self, fdict):
        """Mark a file as exhausted and release its file handle."""
        fdict["closed"] = True
        fdict["fp"].close()

    def _report_bad_timestamp(self, filename, line):
        """Explain an unparsable header and save it to the dump file."""
        import os.path

        print("ERROR: ValueError when parsing timestamp.")
        if "dump" in filename:
            print("Seems to already be reading a dump file, not replacing dump")
            return

        dumpdir = os.path.dirname(self._DUMPFILE)
        if not os.path.isdir(dumpdir):
            print("Directory %s does not exist. Create it to get a dump of exception cause." % (dumpdir))
            return

        ff = self.codecs.open(self._DUMPFILE, "w+", 'utf-8')
        try:
            ff.write(line)
            # cheat, to make it parseable
            ff.write("--- Log closed\n")
        finally:
            ff.close()
        print("Troublesome log dumped to file %s" % self._DUMPFILE)

    def parse_timestamp(self, timestring):
        """Parse the date of a ``--- Log opened`` header.

        `timestring` looks like ``Tue Mar 28 21:17:38 2006``.  Logs written
        under a Norwegian locale use Norwegian day and month names
        (``tor mar 29 13:16:47 2007``); those are replaced with their
        English abbreviations first, so that strptime() can recognise them
        as English.

        Returns the time.struct_time produced by time.strptime() and raises
        ValueError if the string does not match the expected layout.

        The Python library documentation recommends that general purpose
        code avoid locale dependent routines; strptime() with %a and %b is
        one, hence the manual translation.
        """
        # TODO: We have the month, the year and the date. We don't need to
        # bother transforming and parsing the day name, as it is a function
        # of the mentioned three. Remove it some time.
        words = [self._ALIASES.get(word, word) for word in timestring.split()]
        return self.time.strptime(" ".join(words), self._TIME_FORMAT)
# following three procedures are stolen from
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/142812 Jack
# Trainor 2008
def dump(s):
    """Dump any string to formatted hex output.

    Byte strings are rendered by dumpString() and text (unicode) strings by
    dumpUnicodeString().  For a value of any other type None is returned.
    """
    # bytes is an alias of str on Python 2, and type(u'') is unicode on
    # Python 2 and str on Python 3, so no version-specific name is needed.
    if isinstance(s, bytes):
        return dumpString(s)
    if isinstance(s, type(u'')):
        return dumpUnicodeString(s)
    return None
# Lookup table for the "printable" column of the hex dumps: entry i is
# chr(i) for printable ASCII and '.' for everything else.  The backslash
# (92) is masked as well, exactly as the repr()-length test it replaces
# did.  Spelling the range out gives the same table on every interpreter.
FILTER = ''.join([chr(x) if 32 <= x <= 126 and x != 92 else '.'
                  for x in range(256)])


def dumpString(src, length=16):
    """Dump any string, ascii or encoded, to formatted hex output.

    Each output line covers `length` characters of `src` and consists of
    the offset of the first character in hex, the two-digit hex codes of
    the characters and their printable form, with '.' standing in for
    anything unprintable or outside ASCII.

    Returns the lines joined into one string, '' for empty input.
    """
    result = []
    for offset in range(0, len(src), length):
        chunk = src[offset:offset + length]
        # Iterating a Python 3 bytes object already yields integers.
        codes = [c if isinstance(c, int) else ord(c) for c in chunk]
        hexcodes = ' '.join(["%02x" % code for code in codes])
        printable = ''.join([FILTER[code] if code <= 127 else '.'
                             for code in codes])
        result.append("%04x %-*s %s\n" % (offset, length*3, hexcodes, printable))
    return ''.join(result)


# dump unicode string to formatted hex output
def dumpUnicodeString(src, length=8):
    """Dump a unicode string to formatted hex output.

    Each output line covers `length` characters of `src` and consists of
    an offset in hex (twice the character index), the four-digit hex code
    points of the characters and their printable form, with '.' standing
    in for anything unprintable or outside ASCII.

    Returns the lines joined into one string, '' for empty input.
    """
    result = []
    for offset in range(0, len(src), length):
        unichars = src[offset:offset + length]
        hexcodes = ' '.join(["%04x" % ord(x) for x in unichars])
        # FILTER is only consulted for ASCII code points.
        printable = ''.join([FILTER[ord(x)] if ord(x) <= 127 else '.'
                             for x in unichars])
        result.append("%04x %-*s %s\n" % (offset*2, length*5, hexcodes, printable))
    return ''.join(result)
def find_networks(paths):
    """Group the network subdirectories found below the given directories.

    `paths` is an iterable of irclogs directories.  Every directory found
    directly inside one of them is treated as a network, named after its
    last path component ('EFNet', 'IRCNet', ...).  Plain files are ignored.

    Returns a dictionary mapping each network name to the list of
    directories carrying that name, in the order of `paths`.
    """
    known_networks = {}
    for base in paths:
        for entry in glob.glob(base + "/*"):
            if not os.path.isdir(entry):
                continue
            networkname = os.path.basename(entry)
            known_networks.setdefault(networkname, []).append(entry)
    return known_networks
def main():
    """Command-line entry point: merge the logs of all source directories.

    Arguments are read from sys.argv: optional ``-f`` / ``-F`` flags, the
    target directory, then one or more source directories.  Prints the
    usage text and exits with status 255 when the target or the sources
    are missing.

    For every ``<network>/<logfile>`` found in the sources the target file
    is written unless it already exists:

    * without flags an existing target file is never touched,
    * with ``-f`` it is rewritten when its size differs from the combined
      size of its sources,
    * with ``-F`` it is always rewritten.
    """
    # Not very nice, but using getopt is a bit overkill.
    args = sys.argv[1:]
    forcefull = "-F" in args
    force = forcefull or "-f" in args
    positional = [arg for arg in args if arg not in ("-f", "-F")]

    # Count the arguments only after the flags are stripped; otherwise
    # "prog -f -F" passes the check and fails on the missing todir.
    if len(positional) < 2:
        usage()
        sys.exit(255)

    if "-f" in args:
        print("Replacing files in todir if size is incorrect")
    if forcefull:
        print("Overwriting old files in todir unconditionally")

    todir = positional[0]
    dirs = positional[1:]

    if not os.path.isdir(todir):
        os.makedirs(todir)

    # Collect, per "<network>/<logfile>", the copies found in the sources.
    sources = {}
    for paths in find_networks(dirs).values():
        # TODO: irssi sometimes create a second network instance,
        # after wrong use of /connect and such. Perhaps one day one
        # should try to merge these with the original.
        for path in paths:
            for logfile in glob.glob(path + '/*'):
                if not os.path.isfile(logfile):
                    continue
                (network, logname) = logfile.split(os.sep)[-2:]
                sources.setdefault(network + "/" + logname, []).append(logfile)

    for source, files in sources.items():
        tofile = os.path.join(todir, source)
        if not os.path.isdir(os.path.dirname(tofile)):
            os.makedirs(os.path.dirname(tofile))

        if os.path.isfile(tofile) and not forcefull:
            # Merging only reorders lines, so a finished target is as big
            # as its sources together.  Use the real file size here: index
            # 5 of a stat result is the group id, the size is st_size.
            srcsize = sum(os.path.getsize(srcfile) for srcfile in files)
            if os.path.getsize(tofile) == srcsize:
                print("Existing file %s has size equal to size of sources. Assuming file is already sorted. (-F to override)" % tofile)
                continue
            if not force:
                print("File %s already exist. Not overwriting. Do you need -f?" % tofile)
                continue

        if len(files) == 1:
            print("File %s only exist in one source, copying.." % source)
            shutil.copy(files[0], tofile)
            continue

        print("Merging source %s with files: %s" % (source, " ".join(files)))
        fp = codecs.open(tofile, 'w', 'utf-8')
        try:
            lr = IrssiLogReader()
            for logfile in files:
                lr.addfile(logfile)
            lr.run(output=fp)
        finally:
            # Close the target even when the merge fails half-way.
            fp.close()

    print("All files successfully merged, normal exit")
# Run the merge only when the file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment