"""A simple script for counting recurring sentences."""
import operator
import re
import sys
from collections import Counter
# Patterns are compiled once so the per-line loop does not rebuild them.
_ELLIPSIS_RE = re.compile(r'\.\.\.')
_PERIOD_RUN_RE = re.compile(r'\.+')
# Punctuation that is deleted outright: " , - ( ) plus curly double quotes,
# both as real Unicode (U+201C/U+201D) and as the C1 code points U+0093/U+0094
# that appear when Windows-1252 bytes were decoded as Latin-1.
_DROPPED_PUNCT_RE = re.compile(r'[",\-()\x93\x94\u201c\u201d]')
# Curly apostrophe (U+2019, or U+0092 from mis-decoded Windows-1252).
_APOSTROPHE_RE = re.compile(r'[\x92\u2019]')
_WHITESPACE_RE = re.compile(r'\s+')

# Sentences of this length or shorter (period included) are ignored,
# which filters out little runts like "II.".
_RUNT_LENGTH = 3
# Number of most frequent sentences printed by main().
_TOP_N = 100


def cleanse(s):
    """Return *s* with newlines turned into spaces and outer whitespace removed."""
    return s.replace('\n', ' ').strip()


def normalize_line(line):
    """Normalize punctuation and whitespace of one line of text.

    Steps, in order: ellipses become commas, remaining runs of periods
    collapse to a single period, the characters ``" , - ( )`` and curly
    double quotes are deleted, curly apostrophes become ``'``, and every run
    of whitespace (including the trailing newline) becomes a single space.
    The result is returned without leading or trailing whitespace.
    """
    line = _ELLIPSIS_RE.sub(',', line)
    line = _PERIOD_RUN_RE.sub('.', line)
    line = _DROPPED_PUNCT_RE.sub('', line)
    line = _APOSTROPHE_RE.sub("'", line)
    line = _WHITESPACE_RE.sub(' ', line)
    return cleanse(line)


def count_sentences(lines):
    """Count how often each sentence occurs in an iterable of text lines.

    A sentence ends at a period followed by whitespace or by the end of a
    line; text that has not reached such a period yet is carried over and
    joined (with a single space) to the following lines, blank lines
    included. Text still unterminated when the input ends is discarded, as
    it never became a complete sentence.

    Returns a ``collections.Counter`` mapping each sentence (normalized, with
    its trailing period) to its number of occurrences. Sentences of three
    characters or fewer are not counted.
    """
    counts = Counter()
    pending = ''  # text seen so far of a sentence that has not ended yet

    for raw_line in lines:
        line = normalize_line(raw_line)
        if not line:
            # Blank line: the pending sentence simply continues.
            continue

        pieces = line.split('. ')
        ends_sentence = line.endswith('.')
        if ends_sentence:
            # Drop the terminal period so every piece is period-free; this
            # also covers a final line that has no trailing newline.
            pieces[-1] = pieces[-1][:-1]

        # The first piece completes whatever was carried over.
        pieces[0] = cleanse(pending + ' ' + pieces[0])
        pending = ''

        if not ends_sentence:
            # The last piece is incomplete; save it for the next line.
            pending = pieces.pop()

        for piece in pieces:
            # Re-attach the period that splitting removed.
            sentence = cleanse(piece) + '.'
            if len(sentence) > _RUNT_LENGTH:
                counts[sentence] += 1

    return counts


def _decode_line(raw):
    """Decode one line of bytes as UTF-8, falling back to Windows-1252."""
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # The script was written for Windows-1252 "smart quote" input.
        return raw.decode('cp1252', 'replace')


def _read_lines(path):
    """Yield the decoded lines of the file at *path*, closing it afterwards."""
    with open(path, 'rb') as handle:
        for raw in handle:
            yield _decode_line(raw)


def main(argv):
    """Print the most frequent sentences of the file named by ``argv[1]``.

    Output is one ``sentence : count`` line per sentence, most frequent
    first, limited to the top 100; sentences with equal counts appear in
    first-seen order. Returns a process exit status: 0 on success, 2 when no
    filename was given.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s FILE\n" % argv[0])
        return 2

    counts = count_sentences(_read_lines(argv[1]))
    for sentence, count in counts.most_common(_TOP_N):
        print("%s : %s" % (sentence, count))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
