Skip to content

Instantly share code, notes, and snippets.

@DrDub
Created April 22, 2013 01:53
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save DrDub/5431945 to your computer and use it in GitHub Desktop.
Preprocessing of #tikiwiki logs for use with the chat disentangler available at http://www.ling.ohio-state.edu/~melsner/resources/chat-manual.html
#!/usr/bin/env python
#converts a gaim chatlog to a more ethical anonymized version
#format of each input line is
#[datestamp timestamp] <name> comment
#or
#[datestamp timestamp] *** name action
#format of each output line is
#seconds alias : comment
#or
#seconds alias * action
from random import shuffle
from sys import argv
import re
#from time import *
import datetime
def timeToSecs(timeStr, prevTime):
    """Convert a log timestamp to strictly increasing whole seconds.

    The log only has minute resolution, so several consecutive lines can
    carry the same timestamp.  To keep the output ordered, the result is
    always at least one second later than ``prevTime``.

    Args:
        timeStr: timestamp text in ``"%Y-%m-%d %H:%M"`` format.
        prevTime: integer seconds value returned for the previous line
            (0 for the first line).

    Returns:
        Integer number of seconds between 1970-01-01 00:00 and the parsed
        (naive, no timezone conversion) timestamp, bumped to
        ``prevTime + 1`` when it would not otherwise exceed ``prevTime``.

    Raises:
        ValueError: if ``timeStr`` does not match the expected format.
    """
    t = datetime.datetime.strptime(timeStr, "%Y-%m-%d %H:%M")
    # int() instead of long(): same value on Python 2 (which promotes to
    # long automatically when needed) and also valid on Python 3.
    seconds = int((t - datetime.datetime(1970, 1, 1)).total_seconds())
    # Equivalent to incrementing one second at a time until past prevTime.
    return max(seconds, prevTime + 1)
# Anonymize the chat log named on the command line: every nick is replaced
# by a randomly assigned name from the names list, and each event is printed
# with a strictly increasing timestamp in seconds.

#read the names list (got from the US census and preprocessed a little bit)
nameFile = "data/names"
with open(nameFile) as nameHandle:
    names = [x.rstrip().title() for x in nameHandle]
shuffle(names)

chatFile = argv[1]

# nick -> replacement name
aliases = {}
# The gaim intro line (own name, server id, true start time) is not parsed
# for these logs, so the channel name is fixed and the clock starts at 0.
channelName = "tiki"
# the channel maps to itself, so mentions of it are left unchanged
aliases[channelName] = channelName
trueTime = 0

# Groups: 1 = timestamp, 2 = whole name token, 3 = "<nick>" with brackets,
# 4 = bare nick of a spoken line, 5 = nick of an action line (one to three
# leading stars), 6 = remainder of the line.
basicRE = re.compile(r"\[([^\]]+)\] ((<([^>]+)>)|\*\*?\*? ([^\s]+))(.*)")
renameRE = re.compile(r"is now known as ([^\s:]+)")

with open(chatFile) as chat:
    # Single-argument print(...) produces identical output on Python 2 and 3.
    print("Processing %s" % chatFile)
    for lineNumber, line in enumerate(chat, 1):
        match = basicRE.match(line)
        if match is None:
            raise ValueError("%s:%d: unrecognized log line: %r"
                             % (chatFile, lineNumber, line))
        # Use the bare nick (group 4), not the bracketed "<nick>" (group 3),
        # so a speaker shares one alias with their joins, renames and
        # in-text mentions.
        (stamp, full_name, _bracketed, name_comment, action_name,
         rest) = match.groups()
        isComment = name_comment is not None
        name = name_comment if isComment else action_name

        trueTime = timeToSecs(stamp, trueTime)

        try:
            alias = aliases[name]
        except KeyError:
            alias = names.pop()
            aliases[name] = alias

        if "is now known as" not in rest:
            #obnoxiously, people can readopt others' nicknames
            #and then you get cross-aliasing
            for nick, nickAlias in aliases.items():
                if nick in rest:
                    # Lookarounds leave the separators unconsumed, so two
                    # adjacent mentions of the same nick are both replaced.
                    mention = re.compile(r"(?<![a-zA-Z])%s(?![a-zA-Z])"
                                         % re.escape(nick))
                    # Function replacement: the alias is inserted verbatim,
                    # never interpreted as a regex replacement template.
                    rest = mention.sub(lambda m, a=nickAlias: a, rest)

        if isComment:
            print("%s %s : %s" % (trueTime, alias, rest))
        else:
            if rest.endswith("has joined #tikiwiki"):
                rest = " entered the room."
            elif "is now known as" in rest:
                renamed = renameRE.search(rest)
                if renamed is None:
                    raise ValueError("%s:%d: nick change without a new nick: %r"
                                     % (chatFile, lineNumber, line))
                newName = renamed.group(1)
                # the new nick keeps the alias of the old one
                aliases[newName] = alias
                rest = rest.replace(newName, alias)
            print("%s %s * %s" % (trueTime, alias, rest))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment