Created
October 19, 2017 07:36
-
-
Save h2rashee/0390a64250426c97785840baa2583f93 to your computer and use it in GitHub Desktop.
Given a string, find its concordance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Given an arbitrary text document written in English, write a program that will | |
# generate a concordance, i.e. an alphabetical list of all word occurrences, | |
# labeled with word frequencies. | |
# | |
# Bonus: label each word with the sentence numbers in which each occurrence appeared. | |
from nltk.tokenize import word_tokenize | |
import fileinput | |
import nltk | |
# Assumptions: | |
# - I could consider non-case-sensitively but the problem didn't specify | |
# - I could have setup a Makefile or pip-compile route for my dependency | |
# on string processing but have elected to add it to the main for simplicity | |
# and speed in solving this problem | |
# - I am reading from stdin on a single line | |
# - Sentence numbers are included in the second part of the list {} | |
# - I'm considering tokenizing the sentence as a trivial problem and using nltk | |
# as a library to process that for me instead of manually tokenizing it myself | |
# - I'm not handling the use case of i.e. since it involves | |
# Output template: word, a tab, then {count:sentence-numbers}.
# The doubled braces {{ }} render as literal braces, so e.g. the word
# "hello" seen twice in sentences 0 and 3 prints as: hello\t{2:0,3}
OUTPUT_FORMAT = "{}\t{{{}:{}}}"
def find_word_frequencies(text):
    """Print a concordance of *text*.

    Each distinct token is printed with its total frequency and the
    0-based sentence numbers in which it occurred, one word per line,
    formatted by the module-level OUTPUT_FORMAT template
    (word\\t{count:s1,s2,...}), in alphabetical order.

    Tokens come from nltk's word_tokenize; a '.' token is treated purely
    as a sentence separator and is never counted as a word.
    Returns None (output goes to stdout).
    """
    text_tokenized = word_tokenize(text)
    sentence_count = 0
    # word -> [frequency, [sentence indices where the word occurred]]
    word_list = {}
    for word in text_tokenized:
        # Universal separator demarcating a sentence
        if word == '.':
            sentence_count = sentence_count + 1
        elif word in word_list:
            # Modify the word list entry if we've already seen it
            count = word_list[word][0] + 1
            sent_count = word_list[word][1]
            # Don't add the sentence index if we've already seen it in this sentence
            if sentence_count not in sent_count:
                sent_count.append(sentence_count)
            word_list[word] = [count, sent_count]
        else:
            # Create a new entry if we've never seen it before
            word_list[word] = [1, [sentence_count]]
    # BUG FIX: the original called sorted(word_list) and discarded the
    # returned list, so the output came out in arbitrary dict order.
    # Iterating the sorted keys makes the concordance alphabetical as
    # the problem statement requires.
    for word in sorted(word_list):
        # print(...) with a single argument is valid in both Python 2 and 3
        print(OUTPUT_FORMAT.format(word,
                                   word_list[word][0],
                                   ','.join(str(x) for x in word_list[word][1])))
if __name__== "__main__":
    # Download the 'punkt' tokenizer models that word_tokenize needs.
    # NOTE(review): this hits the network on every run; consider caching
    # or guarding with a check — confirm whether that matters for usage.
    nltk.download('punkt')
    # Read a single line of text from stdin (raw_input: this is Python 2;
    # under Python 3 this would need to be input()).
    inp = raw_input("")
    find_word_frequencies(inp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment