Created
April 12, 2014 10:21
-
-
Save kimmobrunfeldt/10528665 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#
# Written by Kimmo Brunfeldt
# IRClog stats generator.
import re | |
import sys | |
import time | |
import logger | |
import random | |
import string | |
from draw_table import draw_table | |
# Nicks with fewer lines than this are left out of the statistics table.
IGNORE_LINES_UNDER = 100

# Number of personal words printed for each nick.
PERSONAL_WORDS = 3

# A word's rate for a person is:
#     (word occurrence) / (all words the person has said)
# When the rates of two people are compared, the examined person's rate is
# first divided by RATE_DIVIDER.
# Rule: the bigger the RATE_DIVIDER, the more personal the words are.
RATE_DIVIDER = 3.0
class IrcLog(object):
    """Parse an IRC log file and compute per-nick chat statistics.

    Only lines that look like messages are examined, e.g.::

        05.12.2011 00:29:51 < nick>: message :D

    Attributes filled in by the constructor (all keyed by lower-cased nick):

    * ``lines`` / ``lines_by_nick`` -- list of ``(timestamp, message)`` tuples
    * ``realnicks`` -- the nick as it was last written in the log
    * ``charactercount`` -- total number of message characters
    * ``specialchars`` -- number of ``string.punctuation`` characters
    * ``links`` -- number of messages containing at least one URL
    * ``hilights`` -- ``{mentioned_nick: count}`` for every speaker
    * ``words`` -- ``{word: count}`` for every speaker
    * ``allwords`` -- ascending list of ``(count, word)`` for the whole log
    """

    def __init__(self, filename):
        """Read *filename*, keep its message lines and build all indexes."""
        # Only the bound ``log`` method of the project's Logger is kept.
        self.log = logger.Logger(time.time()).log

        self.url_regex = re.compile(r"""https?://[^'"<>\s]+""")

        # This regex is used to detect which lines are messages.
        # 05.12.2011 00:29:51 < nick>: message :D
        self.msg_regex = re.compile(r'^[^ ]{10} [^ ]{8} <.+?>:')

        self.log('Reading irclog..')
        # Context manager so the file handle is closed deterministically.
        with open(filename) as logfile:
            self.lines = logfile.read().splitlines()
        self.log('%s lines read.' % len(self.lines))

        self.log('Filter everything else than messages..')
        self.lines = [x for x in self.lines
                      if self.msg_regex.match(x) is not None]

        self.log('Converting lines to dictionary..')
        self.lines_by_nick = {}
        self.realnicks = {}
        self.charactercount = {}
        self.specialchars = {}
        self.links = {}
        for line in self.lines:
            self.splitMessage(line)

        # From here on ``self.lines`` is the per-nick dictionary.
        self.lines = self.lines_by_nick
        self.hilights = dict((nick, {}) for nick in self.realnicks)
        self.getWordsByNick()

    def splitMessage(self, line):
        """Split an irclog line to timestamp, nick and message and record it.

        Updates ``realnicks``, ``links``, ``specialchars``,
        ``charactercount`` and ``lines_by_nick`` for the speaking nick.
        """
        # Everything after the first '>:' is the message, so a '>:' inside
        # the message text is preserved.
        head, _separator, message = line.partition('>:')
        message = message[1:]  # Drop the single space following '>:'.

        timestamp = head[0:19]
        # The sample format above shows one character between '<' and the
        # nick, so the nick starts at index 22.
        realnick = head[22:]
        nick = realnick.lower()
        self.realnicks[nick] = realnick

        # A message counts once, no matter how many URLs it contains.
        if self.url_regex.search(message) is not None:
            self.links[nick] = self.links.get(nick, 0) + 1

        # Count characters here.
        punctuation = sum(1 for char in message if char in string.punctuation)
        self.specialchars[nick] = self.specialchars.get(nick, 0) + punctuation
        self.charactercount[nick] = (self.charactercount.get(nick, 0)
                                     + len(message))

        self.lines_by_nick.setdefault(nick, []).append((timestamp, message))

    def printStats(self, sort_column=None):
        """Build the statistics table and hand it to ``draw_table``.

        Nicks with fewer than ``IGNORE_LINES_UNDER`` lines are skipped.
        ``sort_column`` is passed through to ``draw_table`` unchanged.
        Exits the process with status 1 when a nick has fewer than
        ``PERSONAL_WORDS`` personal words.
        """
        self.log('Finding personal words..')

        # Add headers to table.
        stats = [['Nick', 'Lines', 'Words', 'Characters', 'WPL', 'URLS',
                  'Best friend', 'Most used', 'Personal words']]

        # The 50 most common words of the whole log; the same for every
        # nick, so it is built once, as a set for fast membership tests.
        channel_top = set(word for _count, word in self.allwords[-50:])

        for nick, lines in self.lines.items():
            linecount = len(lines)  # Amount of lines.
            if linecount < IGNORE_LINES_UNDER:
                continue

            realnick = self.realnicks[nick]  # Nick in correct case.
            wordcount = sum(self.words[nick].values())  # Amount of words.
            characters = self.charactercount[nick]  # Amount of chars.
            wordsperline = '%.2f' % (float(wordcount) / linecount)
            personalwords = self.getPersonalWords(nick)
            urls = self.links.get(nick, 0)

            # Best friend is the nick this person has mentioned most often.
            hilights = self.hilights[nick]
            if hilights:
                _count, friend_nick = max(
                    (count, other) for other, count in hilights.items())
                friend = self.realnicks[friend_nick]
            else:
                friend = 'Nobody'

            # Own most used words, excluding the channel-wide top words,
            # single characters and nicknames.
            mostused = sorted(
                (count, word) for word, count in self.words[nick].items()
                if word not in channel_top
                and len(word) > 1
                and word.replace(',', '').replace(':', '').replace('>', '')
                not in self.realnicks)
            candidates = mostused[-30:]
            # Never ask for more samples than there are candidates;
            # random.sample would raise ValueError otherwise.
            picked = random.sample(candidates, min(3, len(candidates)))
            used = '"' + '", "'.join(word for _count, word in picked) + '"'

            try:
                personalwords = '"' + '", "'.join(
                    random.sample(personalwords, PERSONAL_WORDS)) + '"'
            except ValueError:
                print('You have to lower PERSONAL_WORDS or RATE_DIVIDER!')
                sys.exit(1)

            stats.append([realnick, linecount, wordcount, characters,
                          wordsperline, urls, friend, used, personalwords])

        draw_table(stats, sort_column, reverse=True)

    def getPersonalWords(self, nick, rate_divider=None):
        """Return the words that are characteristic of *nick*.

        A word's rate is (occurrences) / (all words the person has said).
        A word is personal when at least one other nick has used it and
        every other nick that has used it has a rate below
        ``(rate of nick) / rate_divider``.  URLs are never personal words.

        ``rate_divider`` defaults to the module-level ``RATE_DIVIDER``.
        Raises ``KeyError`` when *nick* is unknown.
        """
        if rate_divider is None:
            rate_divider = RATE_DIVIDER

        # Total amount of words said, not the amount of distinct words;
        # dividing by the distinct count inflates the rates of talkative
        # nicks.
        nick_total = float(sum(self.words[nick].values()))
        others = [(counts, float(sum(counts.values())))
                  for other, counts in self.words.items() if other != nick]

        all_personal = []
        for word, count in self.words[nick].items():
            if self.url_regex.match(word) is not None:
                continue

            threshold = (count / nick_total) / rate_divider
            some_one_has_said = False
            is_personal = True
            for counts, total in others:
                other_count = counts.get(word)
                if other_count is None:
                    continue
                # The word has been said by the compared nick.
                some_one_has_said = True
                if other_count / total >= threshold:
                    # If even one nick has a high enough rate, it is not
                    # a personal word.
                    is_personal = False
                    break

            if is_personal and some_one_has_said:
                all_personal.append(word)
        return all_personal

    def getWordsByNick(self):
        """Count lower-cased words per nick, channel-wide, and nick mentions.

        Fills ``self.words`` and ``self.allwords`` and updates
        ``self.hilights`` (which must already hold a dict for every nick).
        """
        self.log('Getting words for nicks.')
        self.words = dict((nick, {}) for nick in self.lines)
        totals = {}

        for nick, lines in self.lines.items():
            nick_words = self.words[nick]
            nick_hilights = self.hilights[nick]
            for _timestamp, message in lines:
                for word in message.split():
                    word = word.lower()
                    nick_words[word] = nick_words.get(word, 0) + 1
                    totals[word] = totals.get(word, 0) + 1

                    # "nick:" and "nick," address somebody on the channel.
                    stripped = word.replace(':', '').replace(',', '')
                    if stripped in self.realnicks:  # its a nickname
                        nick_hilights[stripped] = (
                            nick_hilights.get(stripped, 0) + 1)

        # Ascending by count, so the most common words are at the end.
        self.allwords = sorted((count, word) for word, count in totals.items())
if __name__ == '__main__':
    # Script entry point: print the legend, then the statistics table for
    # the IRC log given as the first command-line argument.
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of an IndexError traceback.
        sys.stderr.write('Usage: %s <irclog file>\n' % sys.argv[0])
        sys.exit(1)

    print('Logging started in Wed Aug 17 19:44:49 2011')
    print('')
    print('Most used = Most used words that are NOT in channel\'s top50 words(and is not a nickname)')
    print('WPL = Words per line')
    # The special character percentage column is currently disabled:
    #print('SC-% = How big a percentage of all characters are special characters.')

    irclog = IrcLog(sys.argv[1])
    irclog.printStats(sort_column=3)  # Sort by characters.
    irclog.log('End.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment