kimmobrunfeldt/analyzer.py

## analyzer.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#
# Written by Kimmo Brunfeldt
# IRClog stats generator.

import re
import sys
import time
import logger
import random
import string
from draw_table import draw_table

IGNORE_LINES_UNDER = 100  # Nicks with fewer message lines than this are skipped.
PERSONAL_WORDS = 3        # How many personal words are printed for each nick.

# A word's rate for a nick is: (word occurrences) / len(words dict of that
# nick), i.e. the denominator used by IrcLog.getPersonalWords is the number
# of DISTINCT words the nick has used.
# A word stays "personal" to a nick only while every other nick that has used
# it has a rate strictly below (the nick's own rate) / RATE_DIVIDER.
# Rule: The bigger the RATE_DIVIDER, the more personal the words are.
RATE_DIVIDER = 3.0

class IrcLog(object):
    """Collect per-nick statistics from an IRC log file.

    Only lines shaped like ``05.12.2011 00:29:51 < nick>: message`` are
    treated as messages; every other line of the file is ignored.  Nicks are
    keyed in lower case throughout.

    Attributes filled in by ``__init__``:
        lines / lines_by_nick: nick -> list of (timestamp, message) tuples.
        realnicks: nick -> the nick as it was last written in the log.
        charactercount: nick -> total number of message characters.
        specialchars: nick -> number of ``string.punctuation`` characters.
        links: nick -> number of messages containing at least one URL.
        hilights: nick -> {mentioned nick: times mentioned}.
        words: nick -> {lower-cased word: occurrences}.
        allwords: ascending list of (occurrences, word) over all nicks.
    """

    # Matches http/https URLs inside a message.
    url_regex = re.compile(r"""https?://[^'"<>\s]+""")

    # This regex is used to detect which lines are messages.
    # 05.12.2011 00:29:51 < nick>: message :D
    msg_regex = re.compile(r'^[^ ]{10} [^ ]{8} <.+?>:')

    def __init__(self, filename):
        """Read the log at *filename* and build all per-nick indexes."""
        self.log = logger.Logger(time.time()).log

        self.log('Reading irclog..')
        # The context manager closes the file handle as soon as it is read.
        with open(filename) as logfile:
            raw_lines = logfile.read().splitlines()
        self.log('%s lines read.' % len(raw_lines))

        self.log('Filter everything else than messages..')
        messages = [x for x in raw_lines if self.msg_regex.match(x) is not None]

        self.log('Converting lines to dictionary..')
        self.lines_by_nick = {}
        self.realnicks = {}
        self.charactercount = {}
        self.specialchars = {}
        self.links = {}

        for line in messages:
            self.splitMessage(line)

        self.lines = self.lines_by_nick

        self.hilights = dict((nick, {}) for nick in self.realnicks)
        self.getWordsByNick()

    def splitMessage(self, line):
        """Split an irclog line to timestamp, nick and message and record it.

        Updates ``realnicks``, ``links``, ``specialchars``,
        ``charactercount`` and ``lines_by_nick`` for the line's nick.
        """
        # Cut at the first '>:' only, so a '>:' inside the message survives.
        part1, _, rest = line.partition('>:')
        message = rest[1:]  # Drop the single character that follows '>:'.

        timestamp = part1[0:19]

        # Characters 20-21 are '<' plus one more character; the nick follows.
        realnick = part1[22:]
        nick = realnick.lower()
        self.realnicks[nick] = realnick

        # A message counts once, no matter how many URLs it contains.
        if self.url_regex.search(message) is not None:
            self.links[nick] = self.links.get(nick, 0) + 1

        special = sum(1 for char in message if char in string.punctuation)
        self.specialchars[nick] = self.specialchars.get(nick, 0) + special
        self.charactercount[nick] = self.charactercount.get(nick, 0) + len(message)

        self.lines_by_nick.setdefault(nick, []).append((timestamp, message))

    def printStats(self, sort_column=None):
        """Build the statistics table and hand it to ``draw_table``.

        Nicks with fewer than IGNORE_LINES_UNDER lines are left out.
        *sort_column* is passed through to ``draw_table`` unchanged.

        Exits the process with status 1 when a listed nick has fewer than
        PERSONAL_WORDS personal words.
        """
        self.log('Finding personal words..')
        # Add headers to table.
        stats = [['Nick', 'Lines', 'Words', 'Characters', 'WPL', 'URLS',
                  'Best friend', 'Most used', 'Personal words']]

        # The channel-wide top 50 words are the same for every nick, so build
        # the lookup once; a set keeps the membership test cheap.
        top_words = set(word for count, word in self.allwords[-50:])

        for nick, lines in self.lines.items():
            linecount = len(lines)                   # Amount of lines.
            if linecount < IGNORE_LINES_UNDER:
                continue

            realnick = self.realnicks[nick]          # Nick in correct case.
            wordcount = sum(self.words[nick].values())   # Amount of words.
            characters = self.charactercount[nick]       # Amount of chars.

            wordsperline = '%.2f' % (float(wordcount) / linecount)
            personalwords = self.getPersonalWords(nick)
            urls = self.links.get(nick, 0)

            # Best friend = most mentioned nick; ties go to the nick that
            # sorts last, exactly as with sorting (count, nick) tuples.
            hilights = [(count, name) for name, count in self.hilights[nick].items()]
            if hilights:
                friend = self.realnicks[max(hilights)[1]]
            else:
                friend = 'Nobody'

            mostused = [
                (count, word) for word, count in self.words[nick].items()
                if word not in top_words and len(word) > 1 and
                word.replace(',', '').replace(':', '').replace('>', '') not in self.realnicks
            ]
            mostused.sort()

            # Never ask for more samples than there are candidates;
            # random.sample raises ValueError otherwise.
            candidates = mostused[-30:]
            picked = random.sample(candidates, min(3, len(candidates)))
            used = '"' + '", "'.join([word for count, word in picked]) + '"'

            try:
                personalwords = '"' + '", "'.join(random.sample(personalwords, PERSONAL_WORDS)) + '"'
            except ValueError:
                print('You have to lower PERSONAL_WORDS or RATE_DIVIDER!')
                sys.exit(1)

            info = [realnick, linecount, wordcount, characters, wordsperline,
                    urls, friend, used, personalwords]
            stats.append(info)

        draw_table(stats, sort_column, reverse=True)

    def getPersonalWords(self, nick):
        """Return the words that are characteristic of *nick*.

        A word qualifies when at least one other nick has used it and every
        other nick that used it has a rate strictly below
        (rate of *nick*) / RATE_DIVIDER.  Words starting with a URL are
        skipped.

        NOTE(review): the rate denominator is len() of the per-nick word
        dict, i.e. the number of distinct words, not the total number of
        words said -- confirm this is intended.
        """
        own_words = self.words[nick]
        nick_all_words = len(own_words)

        # Pair every other nick's word dict with its size once, instead of
        # recomputing it for each word.
        others = [(words, len(words))
                  for other, words in self.words.items() if other != nick]

        all_personal = []
        for nick_word, nick_count in own_words.items():

            if self.url_regex.match(nick_word) is not None:
                continue

            nick_word_rate = float(nick_count) / nick_all_words
            threshold = nick_word_rate / RATE_DIVIDER

            is_personal = True
            some_one_has_said = False
            for compare_words, compare_all_words in others:

                if nick_word not in compare_words:
                    continue

                some_one_has_said = True
                compare_word_rate = float(compare_words[nick_word]) / compare_all_words

                if compare_word_rate >= threshold:
                    # If even one nick has a high enough rate, the word is
                    # not personal; skip to the next word.
                    is_personal = False
                    break

            if is_personal and some_one_has_said:
                all_personal.append(nick_word)

        return all_personal

    def getWordsByNick(self):
        """Count words per nick, channel-wide word totals and hilights.

        Fills ``self.words`` and ``self.hilights`` and sets ``self.allwords``
        to an ascending list of (occurrences, word) tuples.  Words are
        whitespace-separated and lower-cased.
        """
        self.log('Getting words for nicks.')

        self.words = dict((nick, {}) for nick in self.lines)
        totals = {}

        for nick, lines in self.lines.items():
            nick_words = self.words[nick]
            for timestamp, message in lines:
                for word in message.split():
                    word = word.lower()
                    # 'in' / dict.get instead of dict.has_key, which does not
                    # exist on Python 3.
                    nick_words[word] = nick_words.get(word, 0) + 1
                    totals[word] = totals.get(word, 0) + 1

                    stripped = word.replace(':', '').replace(',', '')
                    if stripped in self.realnicks:  # its a nickname
                        mentions = self.hilights[nick]
                        mentions[stripped] = mentions.get(stripped, 0) + 1

        self.allwords = sorted((count, word) for word, count in totals.items())

if __name__ == '__main__':
    # Script entry point: the only argument is the path of the irclog file.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.stderr.write('Usage: %s <irclog file>\n' % sys.argv[0])
        sys.exit(1)

    print('Logging started in Wed Aug 17 19:44:49 2011')
    print('')
    print('Most used = Most used words that are NOT in channel\'s top50 words(and is not a nickname)')
    print('WPL = Words per line')
    i = IrcLog(sys.argv[1])
    # Column 3 of the table is 'Characters'; random.choice([3]) always
    # returned 3 anyway.
    i.printStats(sort_column=3)

    #print('SC-% = How big a percentage of all characters are special characters.')
    i.log('End.')
	#!/usr/bin/python
	# -- coding: UTF-8 --
	#
	# Written by Kimmo Brunfeldt
	# IRClog stats generator.

	import re
	import sys
	import time
	import logger
	import random
	import string
	from draw_table import draw_table

	IGNORE_LINES_UNDER = 100 # Nicks with fewer message lines than this are skipped.
	PERSONAL_WORDS = 3 # How many personal words are printed for each nick.

	# A word's rate for a nick is: (word occurrences) / len(words dict of that
	# nick), i.e. the denominator used by IrcLog.getPersonalWords is the number
	# of DISTINCT words the nick has used.
	# A word stays "personal" to a nick only while every other nick that has used
	# it has a rate strictly below (the nick's own rate) / RATE_DIVIDER.
	# Rule: The bigger the RATE_DIVIDER, the more personal the words are.
	RATE_DIVIDER = 3.0

	class IrcLog(object):
	    """Collect per-nick statistics from an IRC log file.

	    Only lines shaped like ``05.12.2011 00:29:51 < nick>: message`` are
	    treated as messages; every other line of the file is ignored.  Nicks are
	    keyed in lower case throughout.

	    Attributes filled in by ``__init__``:
	        lines / lines_by_nick: nick -> list of (timestamp, message) tuples.
	        realnicks: nick -> the nick as it was last written in the log.
	        charactercount: nick -> total number of message characters.
	        specialchars: nick -> number of ``string.punctuation`` characters.
	        links: nick -> number of messages containing at least one URL.
	        hilights: nick -> {mentioned nick: times mentioned}.
	        words: nick -> {lower-cased word: occurrences}.
	        allwords: ascending list of (occurrences, word) over all nicks.
	    """

	    # Matches http/https URLs inside a message.
	    url_regex = re.compile(r"""https?://[^'"<>\s]+""")

	    # This regex is used to detect which lines are messages.
	    # 05.12.2011 00:29:51 < nick>: message :D
	    msg_regex = re.compile(r'^[^ ]{10} [^ ]{8} <.+?>:')

	    def __init__(self, filename):
	        """Read the log at *filename* and build all per-nick indexes."""
	        self.log = logger.Logger(time.time()).log

	        self.log('Reading irclog..')
	        # The context manager closes the file handle as soon as it is read.
	        with open(filename) as logfile:
	            raw_lines = logfile.read().splitlines()
	        self.log('%s lines read.' % len(raw_lines))

	        self.log('Filter everything else than messages..')
	        messages = [x for x in raw_lines if self.msg_regex.match(x) is not None]

	        self.log('Converting lines to dictionary..')
	        self.lines_by_nick = {}
	        self.realnicks = {}
	        self.charactercount = {}
	        self.specialchars = {}
	        self.links = {}

	        for line in messages:
	            self.splitMessage(line)

	        self.lines = self.lines_by_nick

	        self.hilights = dict((nick, {}) for nick in self.realnicks)
	        self.getWordsByNick()

	    def splitMessage(self, line):
	        """Split an irclog line to timestamp, nick and message and record it.

	        Updates ``realnicks``, ``links``, ``specialchars``,
	        ``charactercount`` and ``lines_by_nick`` for the line's nick.
	        """
	        # Cut at the first '>:' only, so a '>:' inside the message survives.
	        part1, _, rest = line.partition('>:')
	        message = rest[1:]  # Drop the single character that follows '>:'.

	        timestamp = part1[0:19]

	        # Characters 20-21 are '<' plus one more character; the nick follows.
	        realnick = part1[22:]
	        nick = realnick.lower()
	        self.realnicks[nick] = realnick

	        # A message counts once, no matter how many URLs it contains.
	        if self.url_regex.search(message) is not None:
	            self.links[nick] = self.links.get(nick, 0) + 1

	        special = sum(1 for char in message if char in string.punctuation)
	        self.specialchars[nick] = self.specialchars.get(nick, 0) + special
	        self.charactercount[nick] = self.charactercount.get(nick, 0) + len(message)

	        self.lines_by_nick.setdefault(nick, []).append((timestamp, message))

	    def printStats(self, sort_column=None):
	        """Build the statistics table and hand it to ``draw_table``.

	        Nicks with fewer than IGNORE_LINES_UNDER lines are left out.
	        *sort_column* is passed through to ``draw_table`` unchanged.

	        Exits the process with status 1 when a listed nick has fewer than
	        PERSONAL_WORDS personal words.
	        """
	        self.log('Finding personal words..')
	        # Add headers to table.
	        stats = [['Nick', 'Lines', 'Words', 'Characters', 'WPL', 'URLS',
	                  'Best friend', 'Most used', 'Personal words']]

	        # The channel-wide top 50 words are the same for every nick, so build
	        # the lookup once; a set keeps the membership test cheap.
	        top_words = set(word for count, word in self.allwords[-50:])

	        for nick, lines in self.lines.items():
	            linecount = len(lines)                   # Amount of lines.
	            if linecount < IGNORE_LINES_UNDER:
	                continue

	            realnick = self.realnicks[nick]          # Nick in correct case.
	            wordcount = sum(self.words[nick].values())   # Amount of words.
	            characters = self.charactercount[nick]       # Amount of chars.

	            wordsperline = '%.2f' % (float(wordcount) / linecount)
	            personalwords = self.getPersonalWords(nick)
	            urls = self.links.get(nick, 0)

	            # Best friend = most mentioned nick; ties go to the nick that
	            # sorts last, exactly as with sorting (count, nick) tuples.
	            hilights = [(count, name) for name, count in self.hilights[nick].items()]
	            if hilights:
	                friend = self.realnicks[max(hilights)[1]]
	            else:
	                friend = 'Nobody'

	            mostused = [
	                (count, word) for word, count in self.words[nick].items()
	                if word not in top_words and len(word) > 1 and
	                word.replace(',', '').replace(':', '').replace('>', '') not in self.realnicks
	            ]
	            mostused.sort()

	            # Never ask for more samples than there are candidates;
	            # random.sample raises ValueError otherwise.
	            candidates = mostused[-30:]
	            picked = random.sample(candidates, min(3, len(candidates)))
	            used = '"' + '", "'.join([word for count, word in picked]) + '"'

	            try:
	                personalwords = '"' + '", "'.join(random.sample(personalwords, PERSONAL_WORDS)) + '"'
	            except ValueError:
	                print('You have to lower PERSONAL_WORDS or RATE_DIVIDER!')
	                sys.exit(1)

	            info = [realnick, linecount, wordcount, characters, wordsperline,
	                    urls, friend, used, personalwords]
	            stats.append(info)

	        draw_table(stats, sort_column, reverse=True)

	    def getPersonalWords(self, nick):
	        """Return the words that are characteristic of *nick*.

	        A word qualifies when at least one other nick has used it and every
	        other nick that used it has a rate strictly below
	        (rate of *nick*) / RATE_DIVIDER.  Words starting with a URL are
	        skipped.

	        NOTE(review): the rate denominator is len() of the per-nick word
	        dict, i.e. the number of distinct words, not the total number of
	        words said -- confirm this is intended.
	        """
	        own_words = self.words[nick]
	        nick_all_words = len(own_words)

	        # Pair every other nick's word dict with its size once, instead of
	        # recomputing it for each word.
	        others = [(words, len(words))
	                  for other, words in self.words.items() if other != nick]

	        all_personal = []
	        for nick_word, nick_count in own_words.items():

	            if self.url_regex.match(nick_word) is not None:
	                continue

	            nick_word_rate = float(nick_count) / nick_all_words
	            threshold = nick_word_rate / RATE_DIVIDER

	            is_personal = True
	            some_one_has_said = False
	            for compare_words, compare_all_words in others:

	                if nick_word not in compare_words:
	                    continue

	                some_one_has_said = True
	                compare_word_rate = float(compare_words[nick_word]) / compare_all_words

	                if compare_word_rate >= threshold:
	                    # If even one nick has a high enough rate, the word is
	                    # not personal; skip to the next word.
	                    is_personal = False
	                    break

	            if is_personal and some_one_has_said:
	                all_personal.append(nick_word)

	        return all_personal

	    def getWordsByNick(self):
	        """Count words per nick, channel-wide word totals and hilights.

	        Fills ``self.words`` and ``self.hilights`` and sets ``self.allwords``
	        to an ascending list of (occurrences, word) tuples.  Words are
	        whitespace-separated and lower-cased.
	        """
	        self.log('Getting words for nicks.')

	        self.words = dict((nick, {}) for nick in self.lines)
	        totals = {}

	        for nick, lines in self.lines.items():
	            nick_words = self.words[nick]
	            for timestamp, message in lines:
	                for word in message.split():
	                    word = word.lower()
	                    # 'in' / dict.get instead of dict.has_key, which does not
	                    # exist on Python 3.
	                    nick_words[word] = nick_words.get(word, 0) + 1
	                    totals[word] = totals.get(word, 0) + 1

	                    stripped = word.replace(':', '').replace(',', '')
	                    if stripped in self.realnicks:  # its a nickname
	                        mentions = self.hilights[nick]
	                        mentions[stripped] = mentions.get(stripped, 0) + 1

	        self.allwords = sorted((count, word) for word, count in totals.items())

	if __name__ == '__main__':
	    # Script entry point: the only argument is the path of the irclog file.
	    if len(sys.argv) < 2:
	        # Fail with a readable message instead of an IndexError traceback.
	        sys.stderr.write('Usage: %s <irclog file>\n' % sys.argv[0])
	        sys.exit(1)

	    print('Logging started in Wed Aug 17 19:44:49 2011')
	    print('')
	    print('Most used = Most used words that are NOT in channel\'s top50 words(and is not a nickname)')
	    print('WPL = Words per line')
	    i = IrcLog(sys.argv[1])
	    # Column 3 of the table is 'Characters'; random.choice([3]) always
	    # returned 3 anyway.
	    i.printStats(sort_column=3)

	    #print('SC-% = How big a percentage of all characters are special characters.')
	    i.log('End.')