Skip to content

Instantly share code, notes, and snippets.

@kimmobrunfeldt
Created April 12, 2014 10:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kimmobrunfeldt/10528665 to your computer and use it in GitHub Desktop.
Save kimmobrunfeldt/10528665 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#
# Written by Kimmo Brunfeldt
# IRClog stats generator.
import re
import sys
import time
import logger
import random
import string
from draw_table import draw_table
# Nicks with fewer message lines than this are left out of the stats table.
IGNORE_LINES_UNDER = 100
# How many personal words are printed for each nick.
PERSONAL_WORDS = 3
# A word's rate for a person is:
#   (occurrences of the word) / (number of distinct words the person has said)
# A word counts as personal for a nick only when at least one other nick has
# used it and every other nick that has used it has a rate below
# (the nick's own rate / RATE_DIVIDER).
# NOTE(review): the rate is computed against the distinct-word count, not the
# total number of words said -- confirm this is the intended denominator.
# Rule: The bigger the RATE_DIVIDER, the more personal the words are.
RATE_DIVIDER = 3.0
class IrcLog(object):
    """Read an IRC log file and compute per-nick statistics from it.

    Only lines that look like messages are examined, e.g.::

        05.12.2011 00:29:51 < nick>: message :D

    Attributes populated by ``__init__`` (all keyed by the lowercased nick):

    * ``lines``          -- list of ``(timestamp, message)`` tuples.
    * ``realnicks``      -- the nick as last seen in the log (original case).
    * ``charactercount`` -- total number of characters in the messages.
    * ``specialchars``   -- total number of ``string.punctuation`` characters.
    * ``links``          -- number of messages containing at least one URL.
    * ``words``          -- ``{word: count}`` of lowercased words.
    * ``hilights``       -- ``{other_nick: count}`` of mentions of other nicks.

    ``allwords`` is a list of ``(count, word)`` tuples over all nicks, sorted
    ascending, so the most used words are at the end.
    """

    def __init__(self, filename):
        """Read *filename*, keep the message lines and build all statistics."""
        self.log = logger.Logger(time.time()).log

        self.url_regex = re.compile(r"""https?://[^'"<>\s]+""")
        # This regex is used to detect which lines are messages.
        # 05.12.2011 00:29:51 < nick>: message :D
        self.msg_regex = re.compile(r'^[^ ]{10} [^ ]{8} <.+?>:')

        self.log('Reading irclog..')
        # Use a context manager so the file handle is always closed.
        with open(filename) as logfile:
            self.lines = logfile.read().splitlines()
        self.log('%s lines read.' % len(self.lines))

        self.log('Filter everything else than messages..')
        self.lines = [x for x in self.lines
                      if self.msg_regex.match(x) is not None]

        self.log('Converting lines to dictionary..')
        self.lines_by_nick = {}
        self.realnicks = {}
        self.charactercount = {}
        self.specialchars = {}
        self.links = {}
        for line in self.lines:
            self.splitMessage(line)

        # From here on ``self.lines`` is the per-nick dictionary.
        self.lines = self.lines_by_nick
        self.hilights = dict((nick, {}) for nick in self.realnicks)
        self.getWordsByNick()

    def splitMessage(self, line):
        """Split an irclog line to timestamp, nick and message and record it.

        Updates ``realnicks``, ``links``, ``specialchars``, ``charactercount``
        and ``lines_by_nick`` for the (lowercased) nick that wrote the line.
        """
        # Everything before the first '>:' is "<timestamp> <Xnick"; the rest
        # is the message, preceded by a single separator character.
        head, _, message = line.partition('>:')
        message = message[1:]
        timestamp = head[0:19]
        # Skip the 19 character timestamp, the space, '<' and the one
        # character between '<' and the nick.
        realnick = head[22:]
        nick = realnick.lower()
        self.realnicks[nick] = realnick

        # Count messages that contain a URL, not individual URLs.
        if self.url_regex.search(message) is not None:
            self.links[nick] = self.links.get(nick, 0) + 1

        special = sum(1 for char in message if char in string.punctuation)
        self.specialchars[nick] = self.specialchars.get(nick, 0) + special
        self.charactercount[nick] = (self.charactercount.get(nick, 0)
                                     + len(message))

        self.lines_by_nick.setdefault(nick, []).append((timestamp, message))

    @staticmethod
    def _quoteWords(words):
        """Return *words* as a comma separated list of double-quoted words."""
        return ', '.join('"%s"' % word for word in words)

    def printStats(self, sort_column=None):
        """Build the statistics table and hand it to ``draw_table``.

        Nicks with fewer than ``IGNORE_LINES_UNDER`` lines are skipped.
        *sort_column* is passed through to ``draw_table`` unchanged.

        Exits the process with status 1 when some nick has fewer than
        ``PERSONAL_WORDS`` personal words.
        """
        self.log('Finding personal words..')
        # Add headers to table.
        stats = [['Nick', 'Lines', 'Words', 'Characters', 'WPL', 'URLS',
                  'Best friend', 'Most used', 'Personal words']]

        # The 50 most used words of the whole channel; identical for every
        # nick, so it is built once and as a set for fast membership tests.
        top_words = set(word for _count, word in self.allwords[-50:])

        for nick, lines in self.lines.items():
            linecount = len(lines)  # Amount of lines.
            if linecount < IGNORE_LINES_UNDER:
                continue

            realnick = self.realnicks[nick]  # Nick in correct case.
            nick_words = self.words[nick]
            wordcount = sum(nick_words.values())  # Amount of words.
            characters = self.charactercount[nick]  # Amount of chars.
            wordsperline = '%.2f' % (float(wordcount) / linecount)
            urls = self.links.get(nick, 0)

            # Best friend is the most mentioned nick (ties are resolved by
            # nick name ordering, as the (count, nick) tuples are compared).
            mentions = self.hilights[nick]
            if mentions:
                _count, friend_nick = max(
                    (count, other) for other, count in mentions.items())
                friend = self.realnicks[friend_nick]
            else:
                friend = 'Nobody'

            # Candidate "most used" words: not a channel-wide top word, longer
            # than one character and not a nickname.
            mostused = [
                (count, word) for word, count in nick_words.items()
                if word not in top_words
                and len(word) > 1
                and (word.replace(',', '').replace(':', '').replace('>', '')
                     not in self.realnicks)
            ]
            mostused.sort()
            candidates = mostused[-30:]
            # Never ask for more samples than there are candidates;
            # random.sample raises ValueError in that case.
            picked = random.sample(candidates, min(3, len(candidates)))
            used = self._quoteWords(word for _count, word in picked)

            personalwords = self.getPersonalWords(nick)
            try:
                personal_sample = random.sample(personalwords, PERSONAL_WORDS)
            except ValueError:
                print('You have to lower PERSONAL_WORDS or RATE_DIVIDER!')
                sys.exit(1)
            personalwords = self._quoteWords(personal_sample)

            stats.append([realnick, linecount, wordcount, characters,
                          wordsperline, urls, friend, used, personalwords])

        draw_table(stats, sort_column, reverse=True)

    def getPersonalWords(self, nick):
        """Return the list of words that are personal for *nick*.

        A word is personal when at least one other nick has said it and every
        other nick that has said it has a word rate below
        ``(rate of nick) / RATE_DIVIDER``. Words that start with a URL are
        ignored.
        """
        all_personal = []
        # NOTE(review): the rate denominator is the number of *distinct* words
        # of the nick, not the total number of words said -- confirm intent.
        nick_all_words = len(self.words[nick])
        for nick_word, nick_count in self.words[nick].items():
            if self.url_regex.match(nick_word) is not None:
                continue
            nick_word_rate = float(nick_count) / nick_all_words
            rate_limit = nick_word_rate / RATE_DIVIDER

            is_personal = True
            some_one_has_said = False
            for compare_nick, compare_words in self.words.items():
                if compare_nick == nick:  # Same nick, skip to next one.
                    continue
                if nick_word not in compare_words:
                    continue
                # The word has been said by compare_nick.
                some_one_has_said = True
                compare_word_rate = (float(compare_words[nick_word])
                                     / len(compare_words))
                if compare_word_rate >= rate_limit:
                    # If even one nick has a high enough rate, it is not a
                    # personal word; skip to the next word.
                    is_personal = False
                    break

            if is_personal and some_one_has_said:
                all_personal.append(nick_word)
        return all_personal

    def getWordsByNick(self):
        """Count words per nick, channel-wide word totals and nick mentions.

        Fills ``self.words`` and ``self.hilights`` and sets ``self.allwords``
        to an ascending sorted list of ``(count, word)`` tuples.
        """
        self.log('Getting words for nicks.')
        self.words = dict((nick, {}) for nick in self.lines)
        totals = {}
        for nick, lines in self.lines.items():
            nick_words = self.words[nick]
            nick_hilights = self.hilights[nick]
            for _timestamp, message in lines:
                for word in message.split():
                    word = word.lower()
                    nick_words[word] = nick_words.get(word, 0) + 1
                    totals[word] = totals.get(word, 0) + 1

                    # "nick:" and "nick," are counted as mentions of nick.
                    stripped = word.replace(':', '').replace(',', '')
                    if stripped in self.realnicks:  # its a nickname
                        nick_hilights[stripped] = (
                            nick_hilights.get(stripped, 0) + 1)

        self.allwords = sorted((count, word) for word, count in totals.items())
if __name__ == '__main__':
    # Script entry point: print the legend, then the stats table for the
    # irclog file given as the first command line argument.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.stderr.write('Usage: %s <irclog file>\n' % sys.argv[0])
        sys.exit(1)

    print('Logging started in Wed Aug 17 19:44:49 2011')
    print('')
    print('Most used = Most used words that are NOT in channel\'s top50 words(and is not a nickname)')
    print('WPL = Words per line')
    i = IrcLog(sys.argv[1])
    # Sort by characters: 3 is the position of the 'Characters' header in
    # the table built by printStats.
    i.printStats(sort_column=3)
    # The stats table has no SC-% column, so its legend line stays disabled.
    #print('SC-% = How big a percentage of all characters are special characters.')
    i.log('End.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment