Skip to content

Instantly share code, notes, and snippets.

@Damian89
Created October 18, 2017 18:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Damian89/681a326c5af463dcc45fe35913f10fb9 to your computer and use it in GitHub Desktop.
Save Damian89/681a326c5af463dcc45fe35913f10fb9 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf8
from sklearn import feature_extraction, decomposition
# No stop words are filtered; the empty list is handed to both vectorizers
# through their ``stop_words`` argument.
stoplist = []

# Toy corpus of six German sentences: the first three are about machine
# learning / artificial intelligence, the last three about animals.
docs = [
    "Maschinelles lernen ist eine Disziplien die irgendwas mit Künstlicher Intelligenz zu tun hat",
    "Künstliche Intelligenz ist ein interessantes Themengebiet",
    "Deep Learning gehört ebenfalls irgendwie zum Gebiet maschinelles Lernen sowie Künstliche Intelligenz",
    "Zwei Elefanten unterscheiden sich von zwei Affen",
    "Was Affen ähnlich zu Menschen macht, ist gut bekannt",
    "Elefanten sind, wie Affen, einfach nur Tiere.",
]
def print_topic_cloud(components, features, words):
    """Print the highest-weighted feature names of each topic, one line per topic.

    Args:
        components: Iterable of per-topic weight vectors. Each vector must
            provide ``argsort()`` (e.g. a row of a NumPy array) and hold one
            weight per feature.
        features: Sequence mapping a feature index to its name.
        words: Number of top features to print per topic. If larger than the
            number of features, all features are printed; values <= 0 print
            an empty line per topic.
    """
    # Clamp so a negative count cannot turn the slice into "almost everything".
    top_n = max(words, 0)
    for topic in components:
        # argsort() is ascending; reverse it to walk from the largest weight down.
        top_indices = topic.argsort()[::-1][:top_n]
        print(' '.join(features[index] for index in top_indices))
def _feature_names(vectorizer):
    """Return the fitted vectorizer's vocabulary as an index -> term sequence.

    scikit-learn 1.0 added ``get_feature_names_out`` and 1.2 removed the older
    ``get_feature_names``; prefer the new method and fall back to the old one
    so the script runs on either side of that change.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# --- Variant 1: TF-IDF weighted term matrix reduced with truncated SVD ---
print("TF-IDF + tSVD")
tfidf_vect = feature_extraction.text.TfidfVectorizer(stop_words=stoplist)
tfidf_vect.fit(docs)
features = _feature_names(tfidf_vect)
tfidf = tfidf_vect.transform(docs)
tfidf_lsi = decomposition.TruncatedSVD(n_components=2)
tfidf_lsi.fit(tfidf)
# Show the 5 strongest terms of each of the 2 components.
print_topic_cloud(tfidf_lsi.components_, features, 5)
print()

# --- Variant 2: raw term counts reduced with truncated SVD ---
print("WordCount + tSVD")
count_vect = feature_extraction.text.CountVectorizer(stop_words=stoplist)
count_vect.fit(docs)
features = _feature_names(count_vect)
count = count_vect.transform(docs)
count_lsi = decomposition.TruncatedSVD(n_components=2)
count_lsi.fit(count)
print_topic_cloud(count_lsi.components_, features, 5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment