Created
February 6, 2020 13:36
-
-
Save pei223/bc4df0849ba67cc2e80385af0e9ac494 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gensim

# Corpus: list of tokenized documents (each document is a list of word strings).
words_list = [["単語", "英語"], ["日本語", "野球"], ]

# Document-frequency bounds used to prune the vocabulary:
# drop words appearing in more than `no_above` (fraction) or fewer than
# `no_below` (absolute count) documents. 1.0 / 0 keeps everything here.
max_document_frequency, min_document_frequency = 1.0, 0.0

# Build the token <-> integer-id dictionary from the corpus.
words_dictionary = gensim.corpora.Dictionary(words_list)
words_dictionary.filter_extremes(no_above=max_document_frequency, no_below=min_document_frequency)

# Convert each document to its bag-of-words (list of (token_id, count)) form.
corpus = [words_dictionary.doc2bow(words) for words in words_list]

# FIX: `num_topics` was used below but never defined — define it explicitly.
num_topics = 2
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=words_dictionary,  # FIX: was `dictionary` (undefined); use the dictionary built above
    random_state=99,  # fixed seed for reproducible topics
)

num_words = 100
# Show the top-100 words of every topic.
# show_topics yields (topic_id, formatted_string) pairs; pass num_topics
# explicitly so all topics are listed (the default caps at 10).
for topic_num, words_by_topic in lda_model.show_topics(num_topics=num_topics, num_words=num_words):
    # words_by_topic is just a preformatted string like '0.5*"単語" + ...'
    print(f"topic {topic_num}: {words_by_topic}\n\n")

# FIX: `lda` was undefined — the model variable is `lda_model`.
for topic_num in range(lda_model.num_topics):
    # Per-topic {word: weight} mapping.
    word_weight_dict_by_topic = dict(lda_model.show_topic(topic_num, num_words))
    print(word_weight_dict_by_topic)

# Inference on a new document.
words_of_target_document = []  # word list of the document to infer (empty here as a placeholder)
corpus_of_target_document = words_dictionary.doc2bow(words_of_target_document)
# Topic distribution of the document: list of (topic_num, probability) pairs.
print(lda_model[corpus_of_target_document])
1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | |
19 | |
20 | |
21 | |
22 | |
23 | |
24 | |
25 | |
26 | |
27 | |
import gensim

# Training data: each document is given as a list of tokens.
words_list = [["単語", "英語"], ["日本語", "野球"], ]

# Vocabulary pruning thresholds (upper fraction / lower absolute count of
# document frequency). These values keep the whole vocabulary.
max_document_frequency, min_document_frequency = 1.0, 0.0

# Map tokens to integer ids.
words_dictionary = gensim.corpora.Dictionary(words_list)
words_dictionary.filter_extremes(no_above=max_document_frequency, no_below=min_document_frequency)

# Bag-of-words encoding of every document via the dictionary.
corpus = [words_dictionary.doc2bow(words) for words in words_list]

# FIX: `num_topics` was referenced without ever being assigned.
num_topics = 2
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=words_dictionary,  # FIX: original passed undefined name `dictionary`
    random_state=99,  # deterministic training
)

num_words = 100
# Print the top-100 words per topic; num_topics is passed explicitly because
# show_topics only returns 10 topics by default.
for topic_num, words_by_topic in lda_model.show_topics(num_topics=num_topics, num_words=num_words):
    # Each words_by_topic entry is a single human-readable string.
    print(f"topic {topic_num}: {words_by_topic}\n\n")

# FIX: original iterated over `lda`, which does not exist; use `lda_model`.
for topic_num in range(lda_model.num_topics):
    # word -> weight dictionary for one topic
    word_weight_dict_by_topic = dict(lda_model.show_topic(topic_num, num_words))
    print(word_weight_dict_by_topic)

# Infer the topic mixture of an unseen document.
words_of_target_document = []  # tokens of the target document (placeholder)
corpus_of_target_document = words_dictionary.doc2bow(words_of_target_document)
# Prints the document's topic probabilities as (topic_num, probability) pairs.
print(lda_model[corpus_of_target_document])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment