Created
October 19, 2017 07:36
-
-
Save h2rashee/0390a64250426c97785840baa2583f93 to your computer and use it in GitHub Desktop.
Given a string, find its concordance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Given an arbitrary text document written in English, write a program that will | |
# generate a concordance, i.e. an alphabetical list of all word occurrences, | |
# labeled with word frequencies. | |
# | |
# Bonus: label each word with the sentence numbers in which each occurrence appeared. | |
from nltk.tokenize import word_tokenize | |
import fileinput | |
import nltk | |
# Assumptions: | |
# - I could consider non-case-sensitively but the problem didn't specify | |
# - I could have setup a Makefile or pip-compile route for my dependency | |
# on string processing but have elected to add it to the main for simplicity | |
# and speed in solving this problem | |
# - I am reading from stdin on a single line | |
# - Sentence numbers are included in the second part of the list {} | |
# - I'm considering tokenizing the sentence as a trivial problem and using nltk | |
# as a library to process that for me instead of manually tokenizing it myself | |
# - I'm not handling the use case of i.e. since it involves | |
# Output template: word, a tab, then {count:sentence-numbers}.
# The doubled braces {{ }} render as literal braces, so e.g. the word
# "hello" seen twice in sentences 0 and 3 prints as: hello\t{2:0,3}
OUTPUT_FORMAT = "{}\t{{{}:{}}}"
def find_word_frequencies(text):
    """Print a concordance of *text*.

    Each distinct token is printed with its total frequency and the
    0-based sentence numbers in which it occurred, one word per line,
    formatted by the module-level OUTPUT_FORMAT template
    (word\\t{count:s1,s2,...}), in alphabetical order.

    Tokens come from nltk's word_tokenize; a '.' token is treated purely
    as a sentence separator and is never counted as a word.
    Returns None (output goes to stdout).
    """
    text_tokenized = word_tokenize(text)
    sentence_count = 0
    # word -> [frequency, [sentence indices where the word occurred]]
    word_list = {}
    for word in text_tokenized:
        # Universal separator demarcating a sentence
        if word == '.':
            sentence_count = sentence_count + 1
        elif word in word_list:
            # Modify the word list entry if we've already seen it
            count = word_list[word][0] + 1
            sent_count = word_list[word][1]
            # Don't add the sentence index if we've already seen it in this sentence
            if sentence_count not in sent_count:
                sent_count.append(sentence_count)
            word_list[word] = [count, sent_count]
        else:
            # Create a new entry if we've never seen it before
            word_list[word] = [1, [sentence_count]]
    # BUG FIX: the original called sorted(word_list) and discarded the
    # returned list, so the output came out in arbitrary dict order.
    # Iterating the sorted keys makes the concordance alphabetical as
    # the problem statement requires.
    for word in sorted(word_list):
        # print(...) with a single argument is valid in both Python 2 and 3
        print(OUTPUT_FORMAT.format(word,
                                   word_list[word][0],
                                   ','.join(str(x) for x in word_list[word][1])))
if __name__== "__main__":
    # Download the 'punkt' tokenizer models that word_tokenize needs.
    # NOTE(review): this hits the network on every run; consider caching
    # or guarding with a check — confirm whether that matters for usage.
    nltk.download('punkt')
    # Read a single line of text from stdin (raw_input: this is Python 2;
    # under Python 3 this would need to be input()).
    inp = raw_input("")
    find_word_frequencies(inp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment