Preprocessing of #tikiwiki logs for use with the chat disentangler available at http://www.ling.ohio-state.edu/~melsner/resources/chat-manual.html
#!/usr/bin/env python
#converts a chatlog (originally gaim, here the #tikiwiki logs) to a more ethical anonymized version
#format of the input lines is
#[datestamp timestamp] <name> comment
#or
#[datestamp timestamp] *** name action
#format of the output lines is
#seconds alias : comment
#or
#seconds alias * action
from random import shuffle | |
from sys import argv | |
import re | |
#from time import * | |
import datetime | |
def timeToSecs(timeStr, prevTime):
    """Convert a "YYYY-MM-DD HH:MM" stamp to strictly increasing epoch seconds.

    timeStr  -- timestamp text in the format "%Y-%m-%d %H:%M"
    prevTime -- the integer value returned for the previous line (0 for the
                first line)

    Returns the number of seconds between 1970-01-01 00:00 and timeStr,
    pushed up to prevTime + 1 when it would not otherwise be greater than
    prevTime.  Raises ValueError (from strptime) if timeStr does not match
    the expected format.
    """
    t = datetime.datetime.strptime(timeStr, "%Y-%m-%d %H:%M")
    #int() rather than long(): same value, and it exists on Python 2 and 3
    seconds = int((t - datetime.datetime(1970, 1, 1)).total_seconds())
    #the stamps only have minute resolution, so several lines share one
    #value; each line must still get a time later than the previous one
    return max(seconds, prevTime + 1)
#read the names list (got from the US census and preprocessed a little bit)
nameFile = "data/names"

#every log line is "[timestamp] <nick> text" or "[timestamp] *** nick text"
#groups: 1 timestamp, 2 "<nick>" or "*** nick", 3 "<nick>" with brackets,
#4 bare nick of a comment, 5 nick of an action, 6 the remainder of the line
basicRE = re.compile(r"\[([^\]]+)\] ((<([^>]+)>)|\*\*?\*? ([^\s]+))(.*)")


def parseLine(line):
    """Split one log line into (time, name, isAction, rest).

    time     -- the text between the square brackets
    name     -- the bare nickname of the speaker / actor
    isAction -- True when the name part contains a '*' (an action line)
    rest     -- everything after the name, without the line terminator

    Raises ValueError if the line does not match the expected layout.
    """
    match = basicRE.match(line)
    if match is None:
        raise ValueError("unrecognized log line: %r" % line)
    (time, full_name, _bracketed, comment_name, action_name, rest) = \
        match.groups()
    #use the bare nick (group 4), not the bracketed "<nick>" (group 3), so
    #that comments, actions and renames of one person share a single key
    name = comment_name if comment_name else action_name
    return time, name, "*" in full_name, rest


def anonymizeMentions(rest, aliases):
    """Return rest with every stand-alone known nickname replaced by its alias.

    A nickname counts as stand-alone when it is not directly preceded or
    followed by an ASCII letter.
    """
    for nick, alias in aliases.items():
        if nick in rest:
            #lookarounds do not consume the neighbouring characters, so two
            #mentions separated by a single space are both replaced
            namepatt = re.compile(r"(?<![a-zA-Z])%s(?![a-zA-Z])" %
                                  re.escape(nick))
            #a function replacement keeps the alias text literal
            rest = namepatt.sub(lambda _match, alias=alias: alias, rest)
    return rest


def anonymizeLine(stamp, name, isAction, rest, aliases, names):
    """Build the anonymized output line for one parsed log entry.

    stamp    -- value printed in the first column
    name     -- nickname of the speaker / actor
    isAction -- True for "*** nick ..." lines, False for "<nick> ..." lines
    rest     -- text after the nickname
    aliases  -- dict nickname -> alias; new entries are added to it
    names    -- list of unused aliases; one is popped for each new nickname

    Returns "stamp alias : text" for comments and "stamp alias * text" for
    actions.  Raises IndexError when names is exhausted and ValueError when
    a rename line carries no new nickname.
    """
    try:
        alias = aliases[name]
    except KeyError:
        alias = names.pop()
        aliases[name] = alias

    #only a real rename (an action line) is exempt from mention replacement;
    #its new nickname is handled separately below
    isRename = isAction and "is now known as" in rest
    if not isRename:
        #obnoxiously, people can readopt others' nicknames
        #and then you get cross-aliasing
        rest = anonymizeMentions(rest, aliases)

    if not isAction:
        return "%s %s : %s" % (stamp, alias, rest)

    if rest.endswith("has joined #tikiwiki"):
        rest = " entered the room."
    elif isRename:
        newName = re.search(r"is now known as ([^\s:]+)", rest)
        if newName is None:
            raise ValueError("rename without a new nickname: %r" % rest)
        newName = newName.group(1)
        #the new nickname keeps the alias of the old one
        aliases[newName] = alias
        rest = rest.replace(newName, alias)
    return "%s %s * %s" % (stamp, alias, rest)


def main():
    """Anonymize the chat log named by argv[1] and print it to stdout."""
    with open(nameFile) as nameHandle:
        names = [x.rstrip().title() for x in nameHandle]
    shuffle(names)

    #load up the file
    chatFile = argv[1]
    with open(chatFile) as chat:
        print("Processing %s" % chatFile)

        #the original intro-line parsing (own name, server id, true start
        #time) is disabled: the channel name is hard-coded, maps to itself
        #and the clock starts at 0
        channelName = "tiki"
        aliases = {channelName: channelName}
        trueTime = 0

        for line in chat:
            (time, name, isAction, rest) = parseLine(line)
            trueTime = timeToSecs(time, trueTime)
            print(anonymizeLine(trueTime, name, isAction, rest,
                                aliases, names))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment