Last active
December 29, 2015 01:49
-
-
Save bryancatanzaro/7595646 to your computer and use it in GitHub Desktop.
A simple script for counting recurring sentences.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import operator
import re
import sys
from collections import Counter
# A sentence of this many characters or fewer (terminal period included) is
# treated as noise and never counted, e.g. a chapter numeral such as "II.".
RUNT_LENGTH = 3

# Number of most frequent sentences printed by main().
TOP_COUNT = 100

# Punctuation normalisation, applied to every line in exactly this order.
_SUBSTITUTIONS = (
    # Ellipses become commas first, otherwise the next rule would turn them
    # into sentence-ending periods. The commas are deleted two rules down.
    (re.compile(r'\.\.\.'), ','),
    # Any remaining run of periods collapses to a single period.
    (re.compile(r'\.+'), '.'),
    # Delete straight double quotes, commas, hyphens and parentheses.
    (re.compile(r'[",\-()]'), ''),
    # Delete curly double quotes: both the real Unicode characters and the
    # \x93/\x94 code points that Windows-1252 bytes turn into when they are
    # read byte-for-byte.
    (re.compile('[\x93\x94\u201c\u201d]'), ''),
    # Turn curly apostrophes (same two spellings) into plain ones.
    (re.compile('[\x92\u2019]'), "'"),
    # Collapse every run of whitespace, newlines included, to one space.
    (re.compile(r'\s+'), ' '),
)

# A sentence ends at a period that is followed by a space.
_SENTENCE_BREAK = re.compile(r'\. ')


def cleanse(s):
    """Return *s* with newlines turned into spaces and both ends stripped."""
    return s.replace('\n', ' ').strip()


def count_sentences(lines, runt_length=RUNT_LENGTH):
    """Count how often each sentence occurs in an iterable of text lines.

    Punctuation is normalised line by line, the text is split at every
    period followed by whitespace (or by the end of the line), and fragments
    that have not reached a period yet are carried over to the following
    lines, so a sentence may span any number of lines.

    Args:
        lines: iterable of ``str``, e.g. an open text file.
        runt_length: sentences whose length (terminal period included) does
            not exceed this value are ignored.

    Returns:
        A ``collections.Counter`` mapping each sentence, with a single
        terminal period, to its number of occurrences. Text left over after
        the last period of the input is not counted.
    """
    counts = Counter()
    # Stripped fragments of the sentence currently being assembled.
    pending = []
    for line in lines:
        for pattern, replacement in _SUBSTITUTIONS:
            line = pattern.sub(replacement, line)
        # Guarantee exactly one trailing space so that a period at the very
        # end of a line is recognised as a sentence break even when the line
        # has no trailing newline (typically the last line of a file).
        pieces = [cleanse(piece)
                  for piece in _SENTENCE_BREAK.split(line.strip() + ' ')]
        # Every piece but the last was terminated by a period.
        for piece in pieces[:-1]:
            pending.append(piece)
            # Skipping empty fragments keeps blank lines inside a sentence
            # from introducing double spaces.
            sentence = ' '.join(part for part in pending if part) + '.'
            pending = []
            if len(sentence) > runt_length:
                counts[sentence] += 1
        # The last piece has no period yet; keep it for the next line.
        pending.append(pieces[-1])
    return counts


def count_file(path):
    """Count the sentences of the text file at *path*.

    The file is read as UTF-8; if it is not valid UTF-8 it is re-read as
    Windows-1252, with undecodable bytes replaced rather than raising.

    Returns:
        The ``collections.Counter`` produced by ``count_sentences``.
    """
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            return count_sentences(handle)
    except UnicodeDecodeError:
        # Counts gathered before the failure are discarded with the local
        # Counter, so starting over cannot double-count anything.
        with open(path, 'r', encoding='cp1252', errors='replace') as handle:
            return count_sentences(handle)


def main(argv=None):
    """Print the most frequent sentences of the file named on the command line.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``. Only the first argument is used.

    Output is one ``sentence : count`` line per sentence, most frequent
    first, limited to ``TOP_COUNT`` entries.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit("usage: %s FILE" % sys.argv[0])
    for sentence, count in count_file(args[0]).most_common(TOP_COUNT):
        print("%s : %s" % (sentence, count))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment