Skip to content

Instantly share code, notes, and snippets.

@deeplook
Created October 21, 2011 13:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deeplook/1303845 to your computer and use it in GitHub Desktop.
Save deeplook/1303845 to your computer and use it in GitHub Desktop.
Print certain stats for subject line length in mailbox files.
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Print certain stats for subject line length in mailbox files.
The idea is to find out what percentage of subject lines are no longer
than twice the length of the mailing list tag inside square brackets.
"""
import re
def stats(path, tag):
    """Print a cumulative length histogram of tagged subject lines.

    The file at ``path`` is read in full. Every line of the form
    ``Betreff: <subject>`` contributes its ``<subject>`` part, and only
    subjects containing ``tag`` are counted.

    Output goes to stdout: a header line, then one row per length from 0
    up to the longest matching subject. Each row shows the length, the
    number of subjects with exactly that length, the running number of
    subjects up to that length, and that running number as a percentage
    of all matching subjects.

    If no subject contains ``tag``, only the header line is printed.

    Args:
        path: Path of the mailbox text file to read.
        tag: Substring a subject must contain in order to be counted.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(path) as mbox:
        archive = mbox.read()

    pat = re.compile("^Betreff: (.*)$", re.M)
    lengths = [len(s) for s in pat.findall(archive) if tag in s]

    # The single-argument print(...) form behaves the same on Python 2 and 3.
    print("Length, #strings, sum(#strings), %")
    if not lengths:
        # Nothing matched: max() below would raise on an empty sequence and
        # the percentage column would divide by zero.
        return

    length_counts = [0] * (max(lengths) + 1)
    for n in lengths:
        length_counts[n] += 1

    # Each matching subject lands in exactly one bucket, so the grand total
    # is simply the number of matching subjects.
    total_sum = len(lengths)
    acc_sum = 0  # running total, updated in place rather than re-summed
    for i, count in enumerate(length_counts):
        acc_sum += count
        args = (i, count, acc_sum, 100 * float(acc_sum) / total_sum)
        print("%2d %2d %2d %6.2f" % args)
if __name__ == "__main__":
    # Script entry point: report on the September 2011 archive, counting
    # only subjects that carry the long mailing list tag. The shorter tag
    # "[Berlin-SZ] " is the alternative to compare against.
    stats(
        path="2011-September.txt",
        tag="[Berlin-Steglitz-Zehlendorf] ",
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment