Created
October 18, 2017 18:34
-
-
Save Damian89/681a326c5af463dcc45fe35913f10fb9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf8
from sklearn import feature_extraction, decomposition
# Stop-word list handed to the vectorizers via ``stop_words``; it is empty,
# so no token is removed by a stop list.
stoplist = []

# Small German demo corpus: the first three sentences deal with machine
# learning / artificial intelligence, the last three with animals.
docs = [
    "Maschinelles lernen ist eine Disziplien die irgendwas mit Künstlicher Intelligenz zu tun hat",
    "Künstliche Intelligenz ist ein interessantes Themengebiet",
    "Deep Learning gehört ebenfalls irgendwie zum Gebiet maschinelles Lernen sowie Künstliche Intelligenz",
    "Zwei Elefanten unterscheiden sich von zwei Affen",
    "Was Affen ähnlich zu Menschen macht, ist gut bekannt",
    "Elefanten sind, wie Affen, einfach nur Tiere."
]
def print_topic_cloud(components, features, words):
    """Print the highest-weighted terms of each topic, one topic per line.

    Args:
        components: Iterable of per-topic weight vectors; each vector must
            provide ``argsort()`` (e.g. the rows of a NumPy 2-D array).
        features: Sequence that maps a weight index to its term.
        words: Number of top terms to print for every topic.
    """
    for topic in components:
        # argsort() orders indices by ascending weight, so reverse it and
        # keep the first `words` entries to list the largest weights first.
        top_indices = topic.argsort()[::-1][:words]
        print(' '.join(features[index] for index in top_indices))
# --- Latent semantic analysis on TF-IDF weighted term vectors ---
print("TF-IDF + tSVD")
tfidf_vect = feature_extraction.text.TfidfVectorizer(stop_words=stoplist)
# Learn the vocabulary and build the document-term matrix in one pass.
tfidf = tfidf_vect.fit_transform(docs)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(tfidf_vect, "get_feature_names_out"):
    features = tfidf_vect.get_feature_names_out()
else:
    features = tfidf_vect.get_feature_names()
# Reduce the term space to two latent components ("topics").
tfidf_lsi = decomposition.TruncatedSVD(n_components=2)
tfidf_lsi.fit(tfidf)
print_topic_cloud(tfidf_lsi.components_, features, 5)
print()
# --- Latent semantic analysis on raw term counts ---
print("WordCount + tSVD")
count_vect = feature_extraction.text.CountVectorizer(stop_words=stoplist)
# Learn the vocabulary and build the document-term count matrix in one pass.
count = count_vect.fit_transform(docs)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    features = count_vect.get_feature_names_out()
else:
    features = count_vect.get_feature_names()
# Reduce the term space to two latent components ("topics").
count_lsi = decomposition.TruncatedSVD(n_components=2)
count_lsi.fit(count)
print_topic_cloud(count_lsi.components_, features, 5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment