Skip to content

Instantly share code, notes, and snippets.

@mallyvai
Created February 11, 2011 00:08
Show Gist options
  • Save mallyvai/821664 to your computer and use it in GitHub Desktop.
Save mallyvai/821664 to your computer and use it in GitHub Desktop.
"""
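Basic Python program that tags each line of a comma-separated conversation
file with its most frequent word.

For every non-blank line of the file named by the first command-line
argument, the line's text is tokenized, stemmed and filtered against English
stop words; the most common remaining stem is then printed in front of the
original line, separated by a comma.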
--Vaibhav Mallya
"""
import sys
import nltk
# Module-level aliases for the NLTK helpers used by get_words().
sent_tokenize = nltk.tokenize.sent_tokenize
word_tokenize = nltk.tokenize.word_tokenize
# NOTE(review): `stem_word` is looked up on a PorterStemmer instance; newer
# NLTK releases appear to expose only `stem` - confirm against the installed
# NLTK version before upgrading.
stem_word = nltk.stem.porter.PorterStemmer().stem_word
# Wrapped in set() so the membership tests in get_words() are hash lookups.
# Presumably requires the NLTK "stopwords" corpus to be installed - verify.
stop_words = set(nltk.corpus.stopwords.words('english')) # This should be a set() - submit CR request to NLTK
def get_words(content):
    """Return the stemmed, stop-word-filtered tokens of *content*.

    The text is split into sentences with ``sent_tokenize``, each sentence
    into tokens with ``word_tokenize``, and every surviving token is stemmed
    with ``stem_word``.  A token is dropped when:

    * the raw token is in ``stop_words``, or
    * its stem is in ``stop_words``, or
    * its stem is a substring of ``".?/!"`` (discards sentence punctuation).

    Args:
        content: The text to tokenize.

    Returns:
        A list of stems in order of appearance, duplicates included.
    """
    words = []
    for sentence in sent_tokenize(content):
        for word in word_tokenize(sentence):
            # Cheap check first: skip stemming entirely for raw stop words.
            if word in stop_words:
                continue
            # Stem once per token and reuse the result for both the filter
            # and the output (the original stemmed each token up to 3 times).
            stem = stem_word(word)
            # Substring test against ".?/!" is kept as-is to preserve the
            # original punctuation filtering.
            if stem in stop_words or stem in ".?/!":
                continue
            words.append(stem)
    return words
if __name__ == "__main__":
    # Usage: python <this script> <input file>
    #
    # For each non-blank line of the input file, print the line prefixed with
    # its most frequent stem (as computed by get_words) and a comma.
    with open(sys.argv[1]) as input_file:  # closed even if processing fails
        for raw_line in input_file:
            # Drop the line terminator so print() does not emit an extra
            # blank line after every record.
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue

            # Commas separate fields; turn them into spaces so the tokenizer
            # sees ordinary running text.
            content = line.replace(",", " ")

            counter = {}
            for word in get_words(content):
                counter[word] = counter.get(word, 0) + 1

            # A line made only of stop words/punctuation yields no stems;
            # emit an empty tag instead of crashing with an IndexError so
            # every input record still produces one output record.
            top_word = max(counter, key=counter.get) if counter else ""

            # Single-argument print(...) works on both Python 2 and Python 3.
            print(','.join([top_word, line]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment