Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Lyuji282/220c828babb6257138dad977919be472 to your computer and use it in GitHub Desktop.
Save Lyuji282/220c828babb6257138dad977919be472 to your computer and use it in GitHub Desktop.
import numpy as np
import nagisa
from gensim import corpora, matutils
from gensim import models
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
def discover_latent_topics(texts, num_topics=3, is_tfidf=True):
    """Label each text with the id of its most probable LDA topic.

    Every text is split into words with ``nagisa``, converted to a
    bag-of-words vector (optionally re-weighted with TF-IDF), and an LDA
    model with ``num_topics`` topics is trained on the resulting corpus.
    Each document is then assigned the topic with the highest probability.

    Args:
        texts: Iterable of raw strings to analyse.
        num_topics: Number of latent topics the LDA model is trained with.
        is_tfidf: When true, train and infer on TF-IDF weights instead of
            raw term counts.

    Returns:
        A list with one ``(topic_id, text)`` tuple per input text, in input
        order. ``topic_id`` is a plain ``int`` and ``text`` is the tokenized
        words joined back together with no separator. An empty input yields
        an empty list.
    """
    tokenized = [nagisa.tagging(text).words for text in texts]
    if not tokenized:
        # There is nothing to train a topic model on and nothing to label.
        return []

    dictionary = Dictionary(tokenized)
    # NOTE: vocabulary pruning is intentionally disabled; for larger corpora
    # consider dictionary.filter_extremes(no_below=10, no_above=0.8).
    corpus = [dictionary.doc2bow(words) for words in tokenized]
    if is_tfidf:
        corpus = models.TfidfModel(corpus)[corpus]

    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)

    texts_with_topics = []
    # Iterate the corpus in lockstep with the tokens instead of indexing it,
    # so the (possibly lazily transformed) corpus never needs random access.
    for bow, words in zip(corpus, tokenized):
        # get_document_topics returns (topic_id, probability) pairs and drops
        # topics below the probability threshold, so a position in that list
        # is not a topic id. Request the full distribution and read the id
        # from the winning pair itself.
        doc_topics = lda.get_document_topics(bow, minimum_probability=0.0)
        topic_id, _ = max(doc_topics, key=lambda pair: pair[1])
        texts_with_topics.append((int(topic_id), "".join(words)))
    return texts_with_topics
if __name__ == "__main__":
    # Demo run: label a handful of short Japanese sentences with their
    # dominant topic and print the resulting (topic_id, text) pairs.
    sample_sentences = [
        "讃岐うどんうまい",
        "ラーメンうまい",
        "海の幸はまずい",
        "貝はまずい",
        "東京の民",
        "東京の人",
    ]
    print(discover_latent_topics(sample_sentences))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment