Skip to content

Instantly share code, notes, and snippets.

@pei223
Created February 6, 2020 13:36
Show Gist options
  • Save pei223/bc4df0849ba67cc2e80385af0e9ac494 to your computer and use it in GitHub Desktop.
Save pei223/bc4df0849ba67cc2e80385af0e9ac494 to your computer and use it in GitHub Desktop.
import gensim
# Example: train a gensim LDA topic model on tokenized documents, print the
# top words of every topic, then infer the topic mixture of a new document.

# Tokenized corpus: one list of words per document.
words_list = [["単語", "英語"], ["日本語", "野球"], ]

# Document-frequency bounds passed to Dictionary.filter_extremes:
# no_above is a fraction of the corpus, no_below is an absolute document count.
max_document_frequency, min_document_frequency = 1.0, 0

num_topics = 2  # number of topics to learn; tune this for the real corpus
num_words = 100  # how many top words to show per topic

# Build the word dictionary and drop words outside the frequency bounds.
words_dictionary = gensim.corpora.Dictionary(words_list)
words_dictionary.filter_extremes(
    no_above=max_document_frequency,
    no_below=min_document_frequency,
)

# Convert every document to its bag-of-words representation.
corpus = [words_dictionary.doc2bow(words) for words in words_list]

lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=words_dictionary,
    random_state=99,  # fixed seed so repeated runs give the same topics
)

# Show the top `num_words` words of every topic. show_topics() returns only
# 10 topics by default, so request all of them explicitly.
for topic_num, words_by_topic in lda_model.show_topics(
    num_topics=lda_model.num_topics, num_words=num_words
):
    # words_by_topic is just a formatted string.
    print(f"topic {topic_num}: {words_by_topic}\n\n")

for topic_num in range(lda_model.num_topics):
    # Mapping of word -> weight for this topic.
    word_weight_dict_by_topic = dict(lda_model.show_topic(topic_num, num_words))
    print(word_weight_dict_by_topic)

# Inference on a new document.
words_of_target_document = []  # word list of the document to infer
# Convert it to bag-of-words with the same dictionary used for training.
corpus_of_target_document = words_dictionary.doc2bow(words_of_target_document)
# Topic probabilities of the document, as a list of (topic_num, probability).
print(lda_model[corpus_of_target_document])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import gensim
# Example: train a gensim LDA topic model on tokenized documents, print the
# top words of every topic, then infer the topic mixture of a new document.

# Tokenized corpus: one list of words per document.
words_list = [["単語", "英語"], ["日本語", "野球"], ]

# Document-frequency bounds passed to Dictionary.filter_extremes:
# no_above is a fraction of the corpus, no_below is an absolute document count.
max_document_frequency, min_document_frequency = 1.0, 0

num_topics = 2  # number of topics to learn; tune this for the real corpus
num_words = 100  # how many top words to show per topic

# Build the word dictionary and drop words outside the frequency bounds.
words_dictionary = gensim.corpora.Dictionary(words_list)
words_dictionary.filter_extremes(
    no_above=max_document_frequency,
    no_below=min_document_frequency,
)

# Convert every document to its bag-of-words representation.
corpus = [words_dictionary.doc2bow(words) for words in words_list]

lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=words_dictionary,
    random_state=99,  # fixed seed so repeated runs give the same topics
)

# Show the top `num_words` words of every topic. show_topics() returns only
# 10 topics by default, so request all of them explicitly.
for topic_num, words_by_topic in lda_model.show_topics(
    num_topics=lda_model.num_topics, num_words=num_words
):
    # words_by_topic is just a formatted string.
    print(f"topic {topic_num}: {words_by_topic}\n\n")

for topic_num in range(lda_model.num_topics):
    # Mapping of word -> weight for this topic.
    word_weight_dict_by_topic = dict(lda_model.show_topic(topic_num, num_words))
    print(word_weight_dict_by_topic)

# Inference on a new document.
words_of_target_document = []  # word list of the document to infer
# Convert it to bag-of-words with the same dictionary used for training.
corpus_of_target_document = words_dictionary.doc2bow(words_of_target_document)
# Topic probabilities of the document, as a list of (topic_num, probability).
print(lda_model[corpus_of_target_document])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment