# ayust/convofinder.py

## convofinder.py
"""Analyze a log of a text communication, looking for distinct discussions."""
from collections import Counter, deque
import re
import string
import sys


# Matches one chat log line of the form "<timestamp> <nick> message":
#   timestamp - a run of digits and colons (e.g. HH:MM or HH:MM:SS)
#   nick      - word characters plus | ^ ` [ ], wrapped in angle brackets; one
#               optional non-word character right after "<" is skipped
#               (presumably an IRC mode prefix such as @ or + - confirm)
#   message   - everything after the whitespace that follows ">"
LINE_RE = re.compile(r"^(?P<timestamp>[\d:]+)\s"
                     r"<\W?(?P<nick>[\w|^`[\]]+)>\s"
                     r"(?P<message>.*)$")


def get_loglines(logfile):
  """Collect every line of `logfile` that LINE_RE recognises.

  Args:
    logfile: an iterable of text lines (typically an open file).

  Returns:
    A list of dicts with the keys 'timestamp', 'nick' and 'message'. The
    timestamp is converted to an integer by reading its colon-separated
    fields as base-60 digits, so the unit is that of the rightmost field.
    Lines that do not match LINE_RE are silently skipped.
  """
  parsed = []
  for raw in logfile:
    match = LINE_RE.match(raw.strip())
    if match is None:
      continue
    entry = match.groupdict()
    # Walk the fields right-to-left so the rightmost one has weight 60**0.
    # An empty field (e.g. from "12::30") counts as zero.
    total = 0
    fields = entry['timestamp'].split(':')
    for power, field in enumerate(reversed(fields)):
      total += (60 ** power) * int(field.lstrip('0') or 0)
    entry['timestamp'] = total
    parsed.append(entry)
  return parsed


def break_lines_by_time(lines, min_separation=10, min_deltafrac=3.0, window=5):
  """Split chronologically ordered lines into blocks separated by pauses.

  A line starts a new block when the gap since the previous line is greater
  than `min_separation` AND greater than `min_deltafrac` times the mean of
  the last `window` gaps that did not start a block (the history starts out
  as `window` zeros, and the first gap is measured from time 0).

  Args:
    lines: sequence of dicts, each with an integer 'timestamp'. Every dict is
      annotated in place with 'delta' (gap since the previous line) and
      'max_delta' (the adaptive threshold that gap was compared against).
    min_separation: absolute gap that must be exceeded to start a new block.
    min_deltafrac: multiplier applied to the mean recent gap.
    window: number of recent gaps that are averaged.

  Returns:
    A list of non-empty lists of lines, in the original order. An empty
    input yields an empty list.
  """
  # An empty log simply has no blocks. The previous `assert lines` crashed
  # here, and was stripped entirely when running under `python -O`.
  if not lines:
    return []

  # float() keeps this a true division on Python 2 even when an integer
  # min_deltafrac is passed in.
  threshold_scale = float(min_deltafrac) / window

  blocks = []
  line_buffer = []
  prev_time = 0
  recent_deltas = deque([0] * window, maxlen=window)

  for line in lines:
    time_delta = line['timestamp'] - prev_time
    max_allowed_delta = sum(recent_deltas) * threshold_scale
    line['max_delta'] = max_allowed_delta
    line['delta'] = time_delta
    if time_delta > max_allowed_delta and time_delta > min_separation:
      # Pause detected: close the current block. The pause itself is not
      # recorded in the gap history, so it does not inflate the threshold.
      if line_buffer:
        blocks.append(line_buffer)
        line_buffer = []
    else:
      recent_deltas.append(time_delta)
    prev_time = line['timestamp']
    line_buffer.append(line)

  if line_buffer:
    blocks.append(line_buffer)

  return blocks


# Punctuation trimmed from both ends of every whitespace-separated token.
_TOKEN_EDGE_CHARS = '.?"!,;:'


def _message_words(message):
  """Return the lowercased, edge-punctuation-stripped tokens of a message."""
  return [token.lower().strip(_TOKEN_EDGE_CHARS) for token in message.split()]


def _is_keyword_candidate(word):
  """Tell whether a token is eligible to be reported as a keyword."""
  # ascii_lowercase is locale-independent and exists on Python 2 and 3,
  # unlike string.lowercase.
  return (len(word) > 2
          and word[0] in string.ascii_lowercase
          and not word.startswith('http'))


def find_convos(logfile):
  """Identify conversations in a log and print a summary of each one.

  The parsed lines are split into conversations by break_lines_by_time. For
  every conversation whose first and last timestamps are more than 5 units
  apart, a banner is printed with its time range, up to 10 keywords and up
  to 10 most active speakers with their line counts.

  A keyword's score is its number of occurrences in the conversation divided
  by the square of the number of conversations that contain it. Only tokens
  longer than two characters that start with an ASCII lowercase letter and
  do not start with 'http' are eligible.

  Args:
    logfile: an iterable of text lines (typically an open file).

  Returns:
    None; the report is written to standard output.
  """
  lines = get_loglines(logfile)
  # An empty or unparseable log contains no conversations to report.
  if not lines:
    return

  # Log-wide counts of eligible keywords; used below purely as a filter.
  allfreqs = Counter()
  for line in lines:
    allfreqs.update(word for word in _message_words(line['message'])
                    if _is_keyword_candidate(word))

  time_convos = break_lines_by_time(lines)

  # Tokenise each conversation once and reuse the result for both the
  # document frequencies and the per-conversation counts.
  convo_words = [[word for line in convo
                  for word in _message_words(line['message'])]
                 for convo in time_convos]

  # Number of conversations in which each token appears at least once.
  docfreqs = Counter()
  for words in convo_words:
    docfreqs.update(set(words))

  for convo, words in zip(time_convos, convo_words):
    duration = convo[-1]['timestamp'] - convo[0]['timestamp']
    if duration <= 5:
      continue

    convofreqs = Counter(words)
    nickfreqs = Counter(line['nick'] for line in convo)

    # docfreqs[k] is at least 1 for every k here because both counters are
    # built from the same tokens, so the division is safe.
    signifs = sorted(((float(convofreqs[k]) / (docfreqs[k] ** 2), k)
                      for k in convofreqs if allfreqs[k] > 0), reverse=True)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("=" * 80)
    print("%s to %s , duration: %s"
          % (convo[0]['timestamp'], convo[-1]['timestamp'], duration))
    print("Keywords: %s" % ', '.join(x[1] for x in signifs[:10]))
    print("Speakers: %s" % ', '.join(
        "%s(%d)" % (x[0], x[1]) for x in nickfreqs.most_common(10)))
    print("=" * 80)


def main():
  """Run the conversation finder on the log file named on the command line.

  The first command-line argument is the path of the log file. If it is
  missing, the process exits with a usage message and a non-zero status
  instead of failing with an IndexError traceback.
  """
  if len(sys.argv) < 2:
    sys.exit("usage: convofinder.py LOGFILE")
  with open(sys.argv[1]) as logfile:
    find_convos(logfile)


# Run the command-line entry point only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
  main()
	"""Analyze a log of a text communication, looking for distinct discussions."""
	from collections import Counter, deque
	import re
	import string
	import sys


	# Matches one chat log line of the form "<timestamp> <nick> message":
	#   timestamp - a run of digits and colons (e.g. HH:MM or HH:MM:SS)
	#   nick      - word characters plus | ^ ` [ ], wrapped in angle brackets; one
	#               optional non-word character right after "<" is skipped
	#               (presumably an IRC mode prefix such as @ or + - confirm)
	#   message   - everything after the whitespace that follows ">"
	LINE_RE = re.compile(r"^(?P<timestamp>[\d:]+)\s"
	r"<\W?(?P<nick>[\w\|^`[\]]+)>\s"
	r"(?P<message>.*)$")


	def get_loglines(logfile):
	  """Collect every line of `logfile` that LINE_RE recognises.

	  Args:
	    logfile: an iterable of text lines (typically an open file).

	  Returns:
	    A list of dicts with the keys 'timestamp', 'nick' and 'message'. The
	    timestamp is converted to an integer by reading its colon-separated
	    fields as base-60 digits, so the unit is that of the rightmost field.
	    Lines that do not match LINE_RE are silently skipped.
	  """
	  parsed = []
	  for raw in logfile:
	    match = LINE_RE.match(raw.strip())
	    if match is None:
	      continue
	    entry = match.groupdict()
	    # Walk the fields right-to-left so the rightmost one has weight 60**0.
	    # An empty field (e.g. from "12::30") counts as zero.
	    total = 0
	    fields = entry['timestamp'].split(':')
	    for power, field in enumerate(reversed(fields)):
	      total += (60 ** power) * int(field.lstrip('0') or 0)
	    entry['timestamp'] = total
	    parsed.append(entry)
	  return parsed


	def break_lines_by_time(lines, min_separation=10, min_deltafrac=3.0, window=5):
	  """Split chronologically ordered lines into blocks separated by pauses.

	  A line starts a new block when the gap since the previous line is greater
	  than `min_separation` AND greater than `min_deltafrac` times the mean of
	  the last `window` gaps that did not start a block (the history starts out
	  as `window` zeros, and the first gap is measured from time 0).

	  Args:
	    lines: sequence of dicts, each with an integer 'timestamp'. Every dict is
	      annotated in place with 'delta' (gap since the previous line) and
	      'max_delta' (the adaptive threshold that gap was compared against).
	    min_separation: absolute gap that must be exceeded to start a new block.
	    min_deltafrac: multiplier applied to the mean recent gap.
	    window: number of recent gaps that are averaged.

	  Returns:
	    A list of non-empty lists of lines, in the original order. An empty
	    input yields an empty list.
	  """
	  # An empty log simply has no blocks; asserting here would crash callers
	  # and would be stripped entirely when running under `python -O`.
	  if not lines:
	    return []

	  # float() keeps this a true division on Python 2 even when an integer
	  # min_deltafrac is passed in.
	  threshold_scale = float(min_deltafrac) / window

	  blocks = []
	  line_buffer = []
	  prev_time = 0
	  recent_deltas = deque([0] * window, maxlen=window)

	  for line in lines:
	    time_delta = line['timestamp'] - prev_time
	    max_allowed_delta = sum(recent_deltas) * threshold_scale
	    line['max_delta'] = max_allowed_delta
	    line['delta'] = time_delta
	    if time_delta > max_allowed_delta and time_delta > min_separation:
	      # Pause detected: close the current block. The pause itself is not
	      # recorded in the gap history, so it does not inflate the threshold.
	      if line_buffer:
	        blocks.append(line_buffer)
	        line_buffer = []
	    else:
	      recent_deltas.append(time_delta)
	    prev_time = line['timestamp']
	    line_buffer.append(line)

	  if line_buffer:
	    blocks.append(line_buffer)

	  return blocks


	# Punctuation trimmed from both ends of every whitespace-separated token.
	_TOKEN_EDGE_CHARS = '.?"!,;:'


	def _message_words(message):
	  """Return the lowercased, edge-punctuation-stripped tokens of a message."""
	  return [token.lower().strip(_TOKEN_EDGE_CHARS) for token in message.split()]


	def _is_keyword_candidate(word):
	  """Tell whether a token is eligible to be reported as a keyword."""
	  # ascii_lowercase is locale-independent and exists on Python 2 and 3,
	  # unlike string.lowercase.
	  return (len(word) > 2
	          and word[0] in string.ascii_lowercase
	          and not word.startswith('http'))


	def find_convos(logfile):
	  """Identify conversations in a log and print a summary of each one.

	  The parsed lines are split into conversations by break_lines_by_time. For
	  every conversation whose first and last timestamps are more than 5 units
	  apart, a banner is printed with its time range, up to 10 keywords and up
	  to 10 most active speakers with their line counts.

	  A keyword's score is its number of occurrences in the conversation divided
	  by the square of the number of conversations that contain it. Only tokens
	  longer than two characters that start with an ASCII lowercase letter and
	  do not start with 'http' are eligible.

	  Args:
	    logfile: an iterable of text lines (typically an open file).

	  Returns:
	    None; the report is written to standard output.
	  """
	  lines = get_loglines(logfile)
	  # An empty or unparseable log contains no conversations to report.
	  if not lines:
	    return

	  # Log-wide counts of eligible keywords; used below purely as a filter.
	  allfreqs = Counter()
	  for line in lines:
	    allfreqs.update(word for word in _message_words(line['message'])
	                    if _is_keyword_candidate(word))

	  time_convos = break_lines_by_time(lines)

	  # Tokenise each conversation once and reuse the result for both the
	  # document frequencies and the per-conversation counts.
	  convo_words = [[word for line in convo
	                  for word in _message_words(line['message'])]
	                 for convo in time_convos]

	  # Number of conversations in which each token appears at least once.
	  docfreqs = Counter()
	  for words in convo_words:
	    docfreqs.update(set(words))

	  for convo, words in zip(time_convos, convo_words):
	    duration = convo[-1]['timestamp'] - convo[0]['timestamp']
	    if duration <= 5:
	      continue

	    convofreqs = Counter(words)
	    nickfreqs = Counter(line['nick'] for line in convo)

	    # docfreqs[k] is at least 1 for every k here because both counters are
	    # built from the same tokens, so the division is safe.
	    signifs = sorted(((float(convofreqs[k]) / (docfreqs[k] ** 2), k)
	                      for k in convofreqs if allfreqs[k] > 0), reverse=True)

	    # Single-argument print(...) produces the same output on Python 2 and 3.
	    print("=" * 80)
	    print("%s to %s , duration: %s"
	          % (convo[0]['timestamp'], convo[-1]['timestamp'], duration))
	    print("Keywords: %s" % ', '.join(x[1] for x in signifs[:10]))
	    print("Speakers: %s" % ', '.join(
	        "%s(%d)" % (x[0], x[1]) for x in nickfreqs.most_common(10)))
	    print("=" * 80)


	def main():
	  """Run the conversation finder on the log file named on the command line.

	  The first command-line argument is the path of the log file. If it is
	  missing, the process exits with a usage message and a non-zero status
	  instead of failing with an IndexError traceback.
	  """
	  if len(sys.argv) < 2:
	    sys.exit("usage: convofinder.py LOGFILE")
	  with open(sys.argv[1]) as logfile:
	    find_convos(logfile)


	# Run the command-line entry point only when executed as a script, not when
	# the module is imported.
	if __name__ == "__main__":
	  main()