Last active
March 1, 2020 12:02
-
-
Save Lyuji282/220c828babb6257138dad977919be472 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import nagisa | |
from gensim import corpora, matutils | |
from gensim import models | |
from gensim.corpora import Dictionary | |
from gensim.models.ldamodel import LdaModel | |
def discover_latent_topics(texts, num_topics=3, is_tfidf=True):
    """Assign each text to its most probable LDA topic.

    Tokenizes each text with nagisa (Japanese morphological analyzer),
    builds a gensim bag-of-words corpus, optionally reweights it with
    TF-IDF, fits an LdaModel, and labels every document with the id of
    its highest-probability topic.

    Args:
        texts: iterable of raw text strings.
        num_topics: number of latent topics for the LDA model.
        is_tfidf: when True, apply TF-IDF weighting before fitting LDA.

    Returns:
        list of (topic_id, joined_tokens) tuples, one per input text.
    """
    tokenized = [nagisa.tagging(text).words for text in texts]
    dictionary = Dictionary(tokenized)
    # dictionary.filter_extremes(no_below=10, no_above=0.8)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    if is_tfidf:
        tfidf = models.TfidfModel(corpus)
        corpus = tfidf[corpus]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    results = []
    for i, tokens in enumerate(tokenized):
        doc_topics = lda.get_document_topics(corpus[i])
        # BUG FIX: the original used np.argmax over the probability list,
        # which yields the *position* within the returned list, not the
        # topic id. get_document_topics filters out topics below
        # minimum_probability, so position and topic id can disagree.
        # Pick the (topic_id, prob) pair with the highest probability.
        topic_id = max(doc_topics, key=lambda t: t[1])[0] if doc_topics else 0
        results.append((topic_id, "".join(tokens)))
    return results
if __name__ == "__main__":
    # Small bilingual-flavored demo corpus: noodles / seafood / Tokyo themes.
    sample_texts = [
        "讃岐うどんうまい",
        "ラーメンうまい",
        "海の幸はまずい",
        "貝はまずい",
        "東京の民",
        "東京の人",
    ]
    print(discover_latent_topics(sample_texts))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment