pei223/gensim_topic_model_sample.py

## gensim_topic_model_sample.py
import gensim

# Minimal LDA topic-model walkthrough with gensim: build a dictionary and a
# bag-of-words corpus, train the model, print the topics, then infer the topic
# mixture of a new document.

# Tokenized training documents: one list of words per document.
words_list = [["単語", "英語"], ["日本語", "野球"]]
# Bounds passed to Dictionary.filter_extremes. Per the gensim docs, `no_above`
# is a fraction of the corpus and `no_below` is an absolute document count;
# 1.0 / 0.0 effectively disable the frequency filtering.
max_document_frequency, min_document_frequency = 1.0, 0.0
# Number of latent topics to fit. The original script used this name without
# ever assigning it (NameError); 2 is a sample value for the two-document corpus.
num_topics = 2

words_dictionary = gensim.corpora.Dictionary(words_list)  # word <-> id mapping
words_dictionary.filter_extremes(no_above=max_document_frequency, no_below=min_document_frequency)
# Convert every document into its bag-of-words form using the dictionary.
corpus = [words_dictionary.doc2bow(words) for words in words_list]
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=words_dictionary,  # was `dictionary`, an undefined name
    random_state=99,  # fixed seed so repeated runs give the same topics
)

num_words = 100
# Show the top `num_words` words of each topic.
for topic_num, words_by_topic in lda_model.show_topics(num_words=num_words):
    # words_by_topic is just a pre-formatted string here.
    print(f"topic {topic_num}:  {words_by_topic}\n\n")
# Same information as a {word: weight} dict per topic (was `lda`, undefined).
for topic_num in range(lda_model.num_topics):
    word_weight_dict_by_topic = dict(lda_model.show_topic(topic_num, num_words))
    print(word_weight_dict_by_topic)

# Inference on a new document.
words_of_target_document = []  # word list of the document to infer (fill in)
# Convert it to bag-of-words with the same dictionary used for training.
corpus_of_target_document = words_dictionary.doc2bow(words_of_target_document)
# Topic probabilities of the document, as a list of (topic_num, probability).
print(lda_model[corpus_of_target_document])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import gensim

# LDA topic modelling sample (gensim): dictionary -> bag-of-words corpus ->
# model training -> topic listing -> inference on an unseen document.

# Training data: each inner list holds the words of one document.
words_list = [["単語", "英語"], ["日本語", "野球"]]
# Upper / lower document-frequency bounds for Dictionary.filter_extremes.
# gensim documents `no_above` as a corpus fraction and `no_below` as an
# absolute document count, so 1.0 / 0.0 keep every word.
max_document_frequency, min_document_frequency = 1.0, 0.0
# Topic count for the model; it was referenced but never defined before,
# which raised NameError. 2 is only a sample value.
num_topics = 2

# Build the word dictionary and drop words outside the frequency bounds.
words_dictionary = gensim.corpora.Dictionary(words_list)
words_dictionary.filter_extremes(
    no_above=max_document_frequency,
    no_below=min_document_frequency,
)
# Bag-of-words representation of every training document.
corpus = [words_dictionary.doc2bow(words) for words in words_list]
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=words_dictionary,  # fixed: the original passed undefined `dictionary`
    random_state=99,  # seed for reproducible training
)

num_words = 100
# Print the top `num_words` words per topic as a formatted string.
for topic_num, words_by_topic in lda_model.show_topics(num_words=num_words):
    # words_by_topic is a plain string, not a structured object.
    print(f"topic {topic_num}:  {words_by_topic}\n\n")
# Print the word -> weight mapping per topic.
# Fixed: the original used the undefined name `lda` instead of `lda_model`.
for topic_num in range(lda_model.num_topics):
    word_weight_dict_by_topic = dict(lda_model.show_topic(topic_num, num_words))
    print(word_weight_dict_by_topic)

# Document inference.
words_of_target_document = []  # words of the document whose topics we want
# Bag-of-words form of the target document, using the training dictionary.
corpus_of_target_document = words_dictionary.doc2bow(words_of_target_document)
# Result is the document's topic distribution: list of (topic_num, probability).
print(lda_model[corpus_of_target_document])
	import gensim

	# gensim LDA sample: train a topic model on a tiny tokenized corpus, list
	# its topics, and infer the topic distribution of a new document.

	# One word list per training document.
	words_list = [["単語", "英語"], ["日本語", "野球"]]
	# Document-frequency bounds for filter_extremes (`no_above` is a corpus
	# fraction, `no_below` an absolute document count per the gensim docs).
	max_document_frequency, min_document_frequency = 1.0, 0.0
	# Number of topics; previously undefined (NameError). Sample value.
	num_topics = 2

	words_dictionary = gensim.corpora.Dictionary(words_list)  # build the word dictionary
	words_dictionary.filter_extremes(no_above=max_document_frequency, no_below=min_document_frequency)
	# Turn each word list into a bag-of-words vector via the dictionary.
	corpus = [words_dictionary.doc2bow(words) for words in words_list]
	lda_model = gensim.models.ldamodel.LdaModel(
	    corpus=corpus,
	    num_topics=num_topics,
	    id2word=words_dictionary,  # was the undefined name `dictionary`
	    random_state=99,  # fixed seed for reproducibility
	)

	num_words = 100
	# Display the top `num_words` words for each topic.
	for topic_num, words_by_topic in lda_model.show_topics(num_words=num_words):
	    # words_by_topic is just a string.
	    print(f"topic {topic_num}: {words_by_topic}\n\n")
	# Word -> weight dictionary per topic (was `lda`, an undefined name).
	for topic_num in range(lda_model.num_topics):
	    word_weight_dict_by_topic = dict(lda_model.show_topic(topic_num, num_words))
	    print(word_weight_dict_by_topic)

	# Inference on a document.
	words_of_target_document = []  # word list of the document to infer
	# Bag-of-words form of that document, using the same dictionary.
	corpus_of_target_document = words_dictionary.doc2bow(words_of_target_document)
	# Topic probabilities as a list of (topic_num, probability) pairs.
	print(lda_model[corpus_of_target_document])
	1
	2
	3
	4
	5
	6
	7
	8
	9
	10
	11
	12
	13
	14
	15
	16
	17
	18
	19
	20
	21
	22
	23
	24
	25
	26
	27
	import gensim

	# Topic modelling with gensim's LdaModel, end to end: corpus preparation,
	# training, topic inspection, and inference for a new document.

	# Tokenized documents used for training (list of word lists).
	words_list = [["単語", "英語"], ["日本語", "野球"]]
	# Frequency limits for pruning the dictionary; gensim treats `no_above` as
	# a fraction of documents and `no_below` as a minimum document count.
	max_document_frequency, min_document_frequency = 1.0, 0.0
	# How many topics to learn. Defined here because the original left it
	# unassigned, which failed with NameError. 2 is a sample value.
	num_topics = 2

	# Word dictionary, pruned by the frequency limits above.
	words_dictionary = gensim.corpora.Dictionary(words_list)
	words_dictionary.filter_extremes(
	    no_above=max_document_frequency,
	    no_below=min_document_frequency,
	)
	# Bag-of-words corpus built from the word lists.
	corpus = [words_dictionary.doc2bow(words) for words in words_list]
	lda_model = gensim.models.ldamodel.LdaModel(
	    corpus=corpus,
	    num_topics=num_topics,
	    id2word=words_dictionary,  # fixed: `dictionary` did not exist
	    random_state=99,  # deterministic seed
	)

	num_words = 100
	# Top `num_words` words of every topic, as gensim's formatted string.
	for topic_num, words_by_topic in lda_model.show_topics(num_words=num_words):
	    # words_by_topic is only a string.
	    print(f"topic {topic_num}: {words_by_topic}\n\n")
	# The same topics as {word: weight} dictionaries.
	# Fixed: `lda` was undefined; the trained model is `lda_model`.
	for topic_num in range(lda_model.num_topics):
	    word_weight_dict_by_topic = dict(lda_model.show_topic(topic_num, num_words))
	    print(word_weight_dict_by_topic)

	# Infer topics for a document.
	words_of_target_document = []  # words of the document to run inference on
	# Convert to bag-of-words with the training dictionary.
	corpus_of_target_document = words_dictionary.doc2bow(words_of_target_document)
	# Prints the topic probabilities: list of (topic_num, probability).
	print(lda_model[corpus_of_target_document])