Damian89/simple-tfidf-count-lsi-example.py

## simple-tfidf-count-lsi-example.py
#!/usr/bin/env python
# coding: utf8

from sklearn import feature_extraction, decomposition

# Stop-word list handed to both vectorizers below via `stop_words`.
# Empty here, i.e. no stop words are supplied.
stoplist = []

# Toy corpus of six German sentences: the first three talk about machine
# learning / artificial intelligence, the last three about animals.
# The strings are the input data of the demo and are used verbatim.
docs = [
    "Maschinelles lernen ist eine Disziplien die irgendwas mit Künstlicher Intelligenz zu tun hat",
    "Künstliche Intelligenz ist ein interessantes Themengebiet",
    "Deep Learning gehört ebenfalls irgendwie zum Gebiet maschinelles Lernen sowie Künstliche Intelligenz",
    "Zwei Elefanten unterscheiden sich von zwei Affen",
    "Was Affen ähnlich zu Menschen macht, ist gut bekannt",
    "Elefanten sind, wie Affen, einfach nur Tiere."
]

def print_topic_cloud(components, features, words):
    """Print one line per topic listing its highest-weighted terms.

    Args:
        components: Iterable of weight vectors, one per topic. Each vector
            must provide ``argsort()`` (e.g. a row of a NumPy array).
        features: Sequence mapping a column index to its term.
        words: How many terms to print for each topic.

    Each printed line holds the selected terms separated by single spaces,
    ordered from the largest weight downwards.
    """
    for weights in components:
        # argsort() is ascending, so flip it and keep the leading positions.
        strongest = weights.argsort()[::-1][:words]
        terms = [features[position] for position in strongest]
        print(' '.join(terms))

def _feature_names(vectorizer):
    """Return the vectorizer's vocabulary terms ordered by column index.

    scikit-learn 1.0 introduced ``get_feature_names_out()`` and release 1.2
    removed the older ``get_feature_names()``. Prefer the new method and
    fall back to the old one so the script runs on either side of that
    change.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# --- Topics from TF-IDF weighted term vectors ---
print("TF-IDF + tSVD")
tfidf_vect = feature_extraction.text.TfidfVectorizer(stop_words=stoplist)
tfidf_vect.fit(docs)
features = _feature_names(tfidf_vect)

# Document-term matrix, reduced to two latent components (LSI).
tfidf = tfidf_vect.transform(docs)
tfidf_lsi = decomposition.TruncatedSVD(n_components=2)
tfidf_lsi.fit(tfidf)

# Show the five strongest terms of each component.
print_topic_cloud(tfidf_lsi.components_, features, 5)

# --- Topics from raw term counts ---
print()
print("WordCount + tSVD")
count_vect = feature_extraction.text.CountVectorizer(stop_words=stoplist)
count_vect.fit(docs)
features = _feature_names(count_vect)

count = count_vect.transform(docs)
count_lsi = decomposition.TruncatedSVD(n_components=2)
count_lsi.fit(count)

print_topic_cloud(count_lsi.components_, features, 5)
	#!/usr/bin/env python
	# coding: utf8

	from sklearn import feature_extraction, decomposition

	# Stop-word list handed to both vectorizers below via `stop_words`.
	# Empty here, i.e. no stop words are supplied.
	stoplist = []

	# Toy corpus of six German sentences: the first three talk about machine
	# learning / artificial intelligence, the last three about animals.
	# The strings are the input data of the demo and are used verbatim.
	docs = [
	"Maschinelles lernen ist eine Disziplien die irgendwas mit Künstlicher Intelligenz zu tun hat",
	"Künstliche Intelligenz ist ein interessantes Themengebiet",
	"Deep Learning gehört ebenfalls irgendwie zum Gebiet maschinelles Lernen sowie Künstliche Intelligenz",
	"Zwei Elefanten unterscheiden sich von zwei Affen",
	"Was Affen ähnlich zu Menschen macht, ist gut bekannt",
	"Elefanten sind, wie Affen, einfach nur Tiere."
	]

	def print_topic_cloud(components, features, words):
	    """Print the top-weighted terms of every topic, one topic per line.

	    Args:
	        components: Iterable of weight vectors, one per topic. Each vector
	            must provide ``argsort()`` (e.g. a row of a NumPy array).
	        features: Sequence mapping a column index to its term.
	        words: How many terms to print for each topic.

	    Terms on a line are separated by single spaces and ordered from the
	    largest weight downwards.
	    """
	    for topic in components:
	        # argsort() is ascending; the reversed slice walks backwards from
	        # the largest weight and stops after `words` entries.
	        print(' '.join(features[index] for index in topic.argsort()[:-words-1:-1]))

	def _feature_names(vectorizer):
	    """Return the vectorizer's vocabulary terms ordered by column index.

	    scikit-learn 1.0 introduced ``get_feature_names_out()`` and release 1.2
	    removed the older ``get_feature_names()``. Prefer the new method and
	    fall back to the old one so the script runs on either side of that
	    change.
	    """
	    if hasattr(vectorizer, "get_feature_names_out"):
	        return vectorizer.get_feature_names_out()
	    return vectorizer.get_feature_names()


	# --- Topics from TF-IDF weighted term vectors ---
	print("TF-IDF + tSVD")
	tfidf_vect = feature_extraction.text.TfidfVectorizer(stop_words=stoplist)
	tfidf_vect.fit(docs)
	features = _feature_names(tfidf_vect)

	# Document-term matrix, reduced to two latent components (LSI).
	tfidf = tfidf_vect.transform(docs)
	tfidf_lsi = decomposition.TruncatedSVD(n_components=2)
	tfidf_lsi.fit(tfidf)

	# Show the five strongest terms of each component.
	print_topic_cloud(tfidf_lsi.components_, features, 5)

	# --- Topics from raw term counts ---
	print()
	print("WordCount + tSVD")
	count_vect = feature_extraction.text.CountVectorizer(stop_words=stoplist)
	count_vect.fit(docs)
	features = _feature_names(count_vect)

	count = count_vect.transform(docs)
	count_lsi = decomposition.TruncatedSVD(n_components=2)
	count_lsi.fit(count)

	print_topic_cloud(count_lsi.components_, features, 5)