# bryancatanzaro/count.py

## count.py
import sys
import re
import operator

# Punctuation-normalising patterns, compiled once because they are applied to
# every input line.
_ELLIPSIS_RE = re.compile(r'\.\.\.')
_PERIOD_RUN_RE = re.compile(r'\.+')
_DROPPED_PUNCTUATION_RE = re.compile(r'[",\-()]')
# Octal escapes \223/\224/\222 are code points 0x93/0x94/0x92, i.e. the curly
# double quotes and curly apostrophe of Windows-1252 (presumably the encoding
# of the original input -- TODO confirm).  Plain raw strings are used instead
# of the Python-2-only ur'' prefix so the file also parses on Python 3.
_CURLY_QUOTES_RE = re.compile(r'[\223\224]')
_CURLY_APOSTROPHE_RE = re.compile(r'\222')
_WHITESPACE_RE = re.compile(r'\s+')

# Sentences of this many characters or fewer (period included) are discarded,
# which filters out little runts like "II.".
_MAX_RUNT_LENGTH = 3


def cleanse(s):
    """Return *s* with newlines turned into spaces and the ends stripped."""
    return s.replace('\n', ' ').strip()


def _normalize(line):
    """Return *line* with punctuation simplified and whitespace collapsed."""
    # Ellipses become commas first so the period-run rule below does not turn
    # them into sentence breaks; the comma is then removed with the rest of
    # the dropped punctuation.
    line = _ELLIPSIS_RE.sub(',', line)
    # Any remaining run of periods counts as a single period.
    line = _PERIOD_RUN_RE.sub('.', line)
    line = _DROPPED_PUNCTUATION_RE.sub('', line)
    line = _CURLY_QUOTES_RE.sub('', line)
    line = _CURLY_APOSTROPHE_RE.sub("'", line)
    # Every whitespace run (including the trailing newline) becomes one space.
    return _WHITESPACE_RE.sub(' ', line)


def count_sentences(lines):
    """Count how often each period-terminated sentence occurs in *lines*.

    A sentence ends at a period followed by whitespace (or by the end of the
    line) and may span several lines.  Text that is never terminated by a
    period is not counted.

    lines -- iterable of text lines, e.g. an open file.
    Returns a dict mapping each sentence (with its final period) to its count.
    """
    counts = {}
    # Fragments of a sentence that has started but not yet reached its period.
    pending = []
    for line in lines:
        text = _normalize(line)
        # A final line without a newline must behave like any other line;
        # otherwise its last sentence is lost or gains a second period.
        if not text.endswith(' '):
            text += ' '
        pieces = text.split('. ')
        # Whatever follows the last period is incomplete; keep it for later.
        remainder = pieces.pop()
        for piece in pieces:
            fragment = cleanse(piece)
            if fragment:
                pending.append(fragment)
            # Add a period at the end (the split removed it).
            sentence = ' '.join(pending) + '.'
            pending = []
            if len(sentence) > _MAX_RUNT_LENGTH:
                counts[sentence] = counts.get(sentence, 0) + 1
        remainder = cleanse(remainder)
        if remainder:
            pending.append(remainder)
    return counts


def top_sentences(counts, limit=100):
    """Return up to *limit* (sentence, count) pairs, most frequent first."""
    if limit <= 0:
        return []
    ranked = sorted(counts.items(), key=operator.itemgetter(1))
    return list(reversed(ranked[-limit:]))


def main(argv):
    """Print the most frequent sentences of the file named in argv[1].

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    if len(argv) < 2:
        sys.stderr.write('usage: %s FILE\n' % (argv[0] if argv else 'count.py'))
        return 2
    # NOTE(review): on Python 3 the file is decoded with the platform default
    # encoding; input in a legacy single-byte encoding may need an explicit
    # encoding argument -- confirm against the real data.
    with open(argv[1], 'r') as f:
        counts = count_sentences(f)
    for sentence, count in top_sentences(counts):
        print("%s : %s" % (sentence, count))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
	import sys
	import re
	import operator

	# NOTE(review): this block repeats the sentence-counting script that
	# appears earlier in this file; consider keeping only one copy.

	# Punctuation-normalising patterns, compiled once because they are
	# applied to every input line.
	_ELLIPSIS_RE = re.compile(r'\.\.\.')
	_PERIOD_RUN_RE = re.compile(r'\.+')
	_DROPPED_PUNCTUATION_RE = re.compile(r'[",\-()]')
	# Octal escapes \223/\224/\222 are code points 0x93/0x94/0x92, i.e. the
	# curly double quotes and curly apostrophe of Windows-1252 (presumably
	# the encoding of the original input -- TODO confirm).  Plain raw strings
	# are used instead of the Python-2-only ur'' prefix so the code also
	# parses on Python 3.
	_CURLY_QUOTES_RE = re.compile(r'[\223\224]')
	_CURLY_APOSTROPHE_RE = re.compile(r'\222')
	_WHITESPACE_RE = re.compile(r'\s+')

	# Sentences of this many characters or fewer (period included) are
	# discarded, which filters out little runts like "II.".
	_MAX_RUNT_LENGTH = 3


	def cleanse(s):
		"""Return *s* with newlines turned into spaces and the ends stripped."""
		return s.replace('\n', ' ').strip()


	def _normalize(line):
		"""Return *line* with punctuation simplified and whitespace collapsed."""
		# Ellipses become commas first so the period-run rule below does not
		# turn them into sentence breaks; the comma is then removed with the
		# rest of the dropped punctuation.
		line = _ELLIPSIS_RE.sub(',', line)
		# Any remaining run of periods counts as a single period.
		line = _PERIOD_RUN_RE.sub('.', line)
		line = _DROPPED_PUNCTUATION_RE.sub('', line)
		line = _CURLY_QUOTES_RE.sub('', line)
		line = _CURLY_APOSTROPHE_RE.sub("'", line)
		# Every whitespace run (including the trailing newline) becomes one
		# space.
		return _WHITESPACE_RE.sub(' ', line)


	def count_sentences(lines):
		"""Count how often each period-terminated sentence occurs in *lines*.

		A sentence ends at a period followed by whitespace (or by the end of
		the line) and may span several lines.  Text that is never terminated
		by a period is not counted.

		lines -- iterable of text lines, e.g. an open file.
		Returns a dict mapping each sentence (with its final period) to its
		count.
		"""
		counts = {}
		# Fragments of a sentence that has started but not yet reached its
		# period.
		pending = []
		for line in lines:
			text = _normalize(line)
			# A final line without a newline must behave like any other line;
			# otherwise its last sentence is lost or gains a second period.
			if not text.endswith(' '):
				text += ' '
			pieces = text.split('. ')
			# Whatever follows the last period is incomplete; keep it for
			# later.
			remainder = pieces.pop()
			for piece in pieces:
				fragment = cleanse(piece)
				if fragment:
					pending.append(fragment)
				# Add a period at the end (the split removed it).
				sentence = ' '.join(pending) + '.'
				pending = []
				if len(sentence) > _MAX_RUNT_LENGTH:
					counts[sentence] = counts.get(sentence, 0) + 1
			remainder = cleanse(remainder)
			if remainder:
				pending.append(remainder)
		return counts


	def top_sentences(counts, limit=100):
		"""Return up to *limit* (sentence, count) pairs, most frequent first."""
		if limit <= 0:
			return []
		ranked = sorted(counts.items(), key=operator.itemgetter(1))
		return list(reversed(ranked[-limit:]))


	def main(argv):
		"""Print the most frequent sentences of the file named in argv[1].

		Returns the process exit status: 0 on success, 2 on a usage error.
		"""
		if len(argv) < 2:
			sys.stderr.write('usage: %s FILE\n' % (argv[0] if argv else 'count.py'))
			return 2
		# NOTE(review): on Python 3 the file is decoded with the platform
		# default encoding; input in a legacy single-byte encoding may need an
		# explicit encoding argument -- confirm against the real data.
		with open(argv[1], 'r') as f:
			counts = count_sentences(f)
		for sentence, count in top_sentences(counts):
			print("%s : %s" % (sentence, count))
		return 0


	if __name__ == '__main__':
		sys.exit(main(sys.argv))