@h2rashee
Created October 19, 2017 07:36
Given a string, find its concordance
# Given an arbitrary text document written in English, write a program that will
# generate a concordance, i.e. an alphabetical list of all word occurrences,
# labeled with word frequencies.
#
# Bonus: label each word with the sentence numbers in which each occurrence appeared.
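#
# Illustrative example (not part of the original problem statement): for the
# input "The cat sat. The cat ran.", each output line below is
# word<TAB>{frequency:sentence numbers}, e.g. "cat\t{2:0,1}" because "cat"
# occurs twice, in sentences 0 and 1 (sentences are numbered from 0 here).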
import nltk
from nltk.tokenize import word_tokenize
# Assumptions:
# - I could treat words case-insensitively, but the problem didn't specify
# - I could have set up a Makefile or a pip-compile route for my dependency
# on string processing but have elected to download it in main for simplicity
# and speed in solving this problem
# - I am reading from stdin on a single line
# - Sentence numbers are included in the second part of the braced output,
# i.e. {frequency:sentence numbers}
# - I'm treating tokenization as a solved problem and using nltk
# as a library to process that for me instead of manually tokenizing it myself
# - I'm not handling abbreviations such as "i.e.", since their periods would
# be mistaken for sentence boundaries
OUTPUT_FORMAT = "{}\t{{{}:{}}}"
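# Illustrative note (not in the original gist): OUTPUT_FORMAT.format("cat", 2, "0,1")
# yields "cat\t{2:0,1}", i.e. the word, a tab, then {frequency:sentence numbers}.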
def find_word_frequencies(text):
    text_tokenized = word_tokenize(text)
    sentence_count = 0
    word_list = {}
    for word in text_tokenized:
        # Universal separator demarcating a sentence
        if word == '.':
            sentence_count = sentence_count + 1
        else:
            if word in word_list:
                # Modify the word list entry if we've already seen it
                count = word_list[word][0] + 1
                sent_count = word_list[word][1]
                # Don't add the sentence index if we've already seen it in this sentence
                if sentence_count not in word_list[word][1]:
                    sent_count.append(sentence_count)
                word_list[word] = [count, sent_count]
            else:
                # Create a new entry if we've never seen it before
                word_list[word] = [1, [sentence_count]]
    # We use the built-in sorted() to order the keys alphabetically
    # (sorted() returns a new list; it does not sort the dict in place)
    # Output every word we've seen in the desired format
    for word in sorted(word_list):
        print(OUTPUT_FORMAT.format(word,
                                   word_list[word][0],
                                   ','.join(str(x) for x in word_list[word][1])))
if __name__ == "__main__":
    nltk.download('punkt')
    inp = input("")
    find_word_frequencies(inp)
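# Example run (illustrative; assumes the script is saved as concordance.py, a
# name not given in the gist; nltk's download messages are omitted):
#   $ python3 concordance.py
#   The cat sat. The cat ran.
#   The     {2:0,1}
#   cat     {2:0,1}
#   ran     {1:1}
#   sat     {1:0}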