# jsundram/text.py

## text.py
import string
import nltk
from collections import defaultdict
# English stopword list from the NLTK corpus, held in a set so that the
# per-token `token not in stopwords` test in summarize() is a hash lookup.
# Evaluated at import time.
stopwords = set(nltk.corpus.stopwords.words('english'))

def get_sentences(path='/Volumes/data-1/files/topicana/cs_nps_comments_cat.txt'):
    """Return the last tab-separated field of every line in *path*.

    Args:
        path: Tab-delimited text file to read. Defaults to the comments
            file this script was originally hard-wired to, so existing
            zero-argument calls keep working.

    Returns:
        A list with one string per input line: everything after the
        line's final tab, or the whole line when it contains no tab. The
        trailing newline is left in place.
    """
    with open(path) as f:
        # rpartition splits once from the right, so only the final field
        # is built rather than a list of every column.
        return [line.rpartition('\t')[2] for line in f]

def print_tagcloud(summary, n=1000):
    """Print the *n* highest-count entries of *summary*, one per line.

    Args:
        summary: Mapping of token -> integer count.
        n: Maximum number of entries to print.

    Each entry is written as ``token: count``. Entries are ordered by
    count, highest first; ties on count are broken by token, also in
    descending order.
    """
    # items() plus a one-argument lambda replace iteritems() and the
    # tuple-parameter lambda, both of which exist only on Python 2.
    ranked = sorted(
        summary.items(),
        key=lambda kv: (kv[1], kv[0]),
        reverse=True,
    )
    for token, freq in ranked[:n]:
        # A parenthesised single argument prints identically whether
        # print is a statement or a function.
        print("%s: %d" % (token, freq))

def summarize(sentences):
    """Count lemmatized, non-stopword tokens across *sentences*.

    Each sentence has the characters in ``string.punctuation`` removed,
    is lower-cased, and is split on whitespace. Tokens found in the
    module-level ``stopwords`` set are dropped; the rest are passed
    through NLTK's ``WordNetLemmatizer`` and tallied.

    Args:
        sentences: Iterable of strings.

    Returns:
        A ``defaultdict(int)`` mapping each lemmatized token to the
        number of times it occurred.
    """
    count = defaultdict(int)
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # Built once, outside the loop. Filtering characters against this set
    # replaces ``translate(string.maketrans("", ""), deletechars)``, a
    # form that only works on Python 2 byte strings: it raises on unicode
    # text, and string.maketrans does not exist on Python 3.
    punctuation = frozenset(string.punctuation)
    for sentence in sentences:
        nopunc = "".join(ch for ch in sentence if ch not in punctuation)
        for token in tokenizer.tokenize(nopunc.lower()):
            # The stopword test runs on the raw token, before lemmatizing.
            if token not in stopwords:
                count[lemmatizer.lemmatize(token)] += 1
    return count
	import string
	import nltk
	from collections import defaultdict
	# English stopword list from the NLTK corpus, held in a set so that the
	# per-token `token not in stopwords` test in summarize() is a hash lookup.
	stopwords = set(nltk.corpus.stopwords.words('english'))

	def get_sentences(path='/Volumes/data-1/files/topicana/cs_nps_comments_cat.txt'):
	    """Return the last tab-separated field of every line in *path*.

	    Args:
	        path: Tab-delimited text file to read. Defaults to the comments
	            file this script was originally hard-wired to, so existing
	            zero-argument calls keep working.

	    Returns:
	        A list with one string per input line: everything after the
	        line's final tab, or the whole line when it contains no tab.
	        The trailing newline is left in place.
	    """
	    with open(path) as f:
	        # rpartition splits once from the right, so only the final
	        # field is built rather than a list of every column.
	        return [line.rpartition('\t')[2] for line in f]

	def print_tagcloud(summary, n=1000):
	    """Print the *n* highest-count entries of *summary*, one per line.

	    Args:
	        summary: Mapping of token -> integer count.
	        n: Maximum number of entries to print.

	    Each entry is written as ``token: count``. Entries are ordered by
	    count, highest first; ties on count are broken by token, also in
	    descending order.
	    """
	    # items() plus a one-argument lambda replace iteritems() and the
	    # tuple-parameter lambda, both of which exist only on Python 2.
	    ranked = sorted(
	        summary.items(),
	        key=lambda kv: (kv[1], kv[0]),
	        reverse=True,
	    )
	    for token, freq in ranked[:n]:
	        # A parenthesised single argument prints identically whether
	        # print is a statement or a function.
	        print("%s: %d" % (token, freq))

	def summarize(sentences):
	    """Count lemmatized, non-stopword tokens across *sentences*.

	    Each sentence has the characters in ``string.punctuation`` removed,
	    is lower-cased, and is split on whitespace. Tokens found in the
	    module-level ``stopwords`` set are dropped; the rest are passed
	    through NLTK's ``WordNetLemmatizer`` and tallied.

	    Args:
	        sentences: Iterable of strings.

	    Returns:
	        A ``defaultdict(int)`` mapping each lemmatized token to the
	        number of times it occurred.
	    """
	    count = defaultdict(int)
	    tokenizer = nltk.tokenize.WhitespaceTokenizer()
	    lemmatizer = nltk.stem.WordNetLemmatizer()
	    # Built once, outside the loop. Filtering characters against this
	    # set replaces ``translate(string.maketrans("", ""), deletechars)``,
	    # a form that only works on Python 2 byte strings: it raises on
	    # unicode text, and string.maketrans does not exist on Python 3.
	    punctuation = frozenset(string.punctuation)
	    for sentence in sentences:
	        nopunc = "".join(ch for ch in sentence if ch not in punctuation)
	        for token in tokenizer.tokenize(nopunc.lower()):
	            # The stopword test runs on the raw token, before lemmatizing.
	            if token not in stopwords:
	                count[lemmatizer.lemmatize(token)] += 1
	    return count