# MaartenGr/top_words.py

## top_words.py
# Create bag of words
# Learn the vocabulary and build the term-count matrix in one pass over the
# same documents (equivalent to fit() followed by transform() on that input).
count_vectorizer = CountVectorizer()
count = count_vectorizer.fit_transform(docs_per_class.Document)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; wrap
# in list() so `words` stays a plain list either way.
try:
    words = list(count_vectorizer.get_feature_names_out())
except AttributeError:
    words = list(count_vectorizer.get_feature_names())

# Extract top 10 words per class
ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(docs)).toarray()
# argsort() sorts ascending, so the last 10 indices are the highest-scoring
# terms; each word list therefore runs from lowest to highest score.
# NOTE(review): ctfidf[label] assumes row i of the count matrix belongs to
# class label i -- confirm against how docs_per_class is built.
words_per_class = {
    newsgroups.target_names[label]: [words[index] for index in ctfidf[label].argsort()[-10:]]
    for label in docs_per_class.Class
}
	# NOTE(review): this tab-indented section repeats the statements directly
	# above it; confirm whether it is an accidental duplicate or belongs inside
	# an enclosing definition.
	# Create bag of words
	# Learn the vocabulary and build the term-count matrix in one pass over the
	# same documents (equivalent to fit() followed by transform() on that input).
	count_vectorizer = CountVectorizer()
	count = count_vectorizer.fit_transform(docs_per_class.Document)

	# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
	# Prefer get_feature_names_out() and fall back only on older releases; wrap
	# in list() so `words` stays a plain list either way.
	try:
		words = list(count_vectorizer.get_feature_names_out())
	except AttributeError:
		words = list(count_vectorizer.get_feature_names())

	# Extract top 10 words per class
	ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(docs)).toarray()
	# argsort() sorts ascending, so the last 10 indices are the highest-scoring
	# terms; each word list therefore runs from lowest to highest score.
	# NOTE(review): ctfidf[label] assumes row i of the count matrix belongs to
	# class label i -- confirm against how docs_per_class is built.
	words_per_class = {
		newsgroups.target_names[label]: [words[index] for index in ctfidf[label].argsort()[-10:]]
		for label in docs_per_class.Class
	}