@jsundram
Created March 30, 2012 14:53
Print wordle.net-style output for some input sentences.
import string
import nltk
from collections import defaultdict

stopwords = set(nltk.corpus.stopwords.words('english'))


def get_sentences():
    # Each input line is tab-separated; the sentence is the last field.
    sentences = []
    with open('/Volumes/data-1/files/topicana/cs_nps_comments_cat.txt') as f:
        for line in f:
            sentences.append(line.split('\t')[-1])
    return sentences


def print_tagcloud(summary, n=1000):
    # Print the n most frequent terms as "word: count", suitable for wordle.net.
    for (t, i) in sorted(summary.iteritems(), key=lambda (k, v): (v, k), reverse=True)[:n]:
        print "%s: %d" % (t, i)


def summarize(sentences):
    # Count lemmatized, lowercased, non-stopword tokens across all sentences.
    count = defaultdict(int)
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    for sentence in sentences:
        nopunc = sentence.translate(string.maketrans("", ""), string.punctuation)
        lower = nopunc.lower()
        for token in tokenizer.tokenize(lower):
            if token not in stopwords:
                token = lemmatizer.lemmatize(token)
                count[token] += 1
    return count
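The gist stops at the function definitions and doesn't show how they are wired together. A minimal driver, assuming the hard-coded input path exists and that the NLTK stopwords and WordNet data have already been fetched (e.g. via nltk.download('stopwords') and nltk.download('wordnet')), might look like this in the same Python 2 style:

if __name__ == '__main__':
    # Read the tab-separated comments file, count lemmatized non-stopword
    # tokens, and print the top terms for pasting into wordle.net.
    sentences = get_sentences()
    counts = summarize(sentences)
    # 200 is an arbitrary cutoff for illustration; the default is 1000.
    print_tagcloud(counts, n=200)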