This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!pip install bertopic
# You may want to install more depending on the transformers and language backends that you will be using. The possible installations are:
#!pip install bertopic[flair]
#!pip install bertopic[gensim]
#!pip install bertopic[spacy]
#!pip install bertopic[use]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a new corpus, made of previously unseen documents.
# NOTE(review): the tokens look pre-stemmed ('hockei', 'qualifi'); they should
# go through the same preprocessing as the training texts -- confirm.
other_texts = [
    ['champion', 'hockei', 'qualifi'],
    ['survey', 'tournament', 'world'],
]

# Convert each token list to bag-of-words form using the existing dictionary.
other_corpus = [gensim_dictionary.doc2bow(text) for text in other_texts]
unseen_doc = other_corpus[0]

# Index the trained ensemble model with the first unseen document.
vector = elda_bow[unseen_doc]  # get topic probability distribution for a document
vector
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel

# Score the BOW-trained ensemble model with the 'c_v' coherence measure,
# using the tokenized texts and the shared dictionary.
coherence_elda_bow = CoherenceModel(
    model=elda_bow,
    texts=text_tokenized,
    dictionary=gensim_dictionary,
    coherence='c_v',
)
coherence_elda = coherence_elda_bow.get_coherence()
print('\nCoherence Score: ', coherence_elda)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint as pp

# Pretty-print the topics reported by the BOW-trained ensemble model,
# then show how many topics were returned.
pp.pprint(elda_bow.print_topics())
len(elda_bow.print_topics())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.test.utils import datapath

# NOTE(review): datapath() is meant to resolve names inside gensim's bundled
# test-data directory. The paths below are absolute, so it presumably returns
# them unchanged -- confirm, and consider passing the path strings directly.

# Save the eLDA model trained on BOW data
elda_bow_file = datapath("/content/gdrive/My Drive/data/gensim/eLDA_bow_AGnews")
elda_bow.save(elda_bow_file)

# Save the eLDA model trained on TF-IDF data
elda_tfidf_file = datapath("/content/gdrive/My Drive/data/gensim/eLDA_tfidf_AGnews")
elda_tfidf.save(elda_tfidf_file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.corpora.dictionary import Dictionary
from gensim.models import ensemblelda

# Train one Ensemble LDA model per corpus representation. Both share the same
# dictionary and the same num_topics setting.
elda_bow = ensemblelda.EnsembleLda(
    corpus=bow_corpus,
    id2word=gensim_dictionary,
    num_topics=600,
)
elda_tfidf = ensemblelda.EnsembleLda(
    corpus=tfidf_corpus,
    id2word=gensim_dictionary,
    num_topics=600,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pickle

# Load the pickled bag-of-words corpus. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("/content/gdrive/My Drive/data/gensim/BoW_AGnews_corpus.pkl", 'rb') as file:
    bow_corpus = pickle.load(file)

# Show the first document as a quick sanity check.
bow_corpus[0]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# You can use pickle too
import pickle

# Open the file where you want to store the data. The context manager
# flushes and closes it even if pickling raises.
with open("/content/gdrive/My Drive/data/gensim/BoW_AGnews_corpus.pkl", 'wb') as file:
    pickle.dump(bow_corpus, file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim import corpora

# Save in the matrix market format
corpora.MmCorpus.serialize(
    "/content/gdrive/My Drive/data/gensim/BoW_AGnews_corpus.mm",
    bow_corpus,
)

# Load
# bow_corpus is rebound to the MmCorpus object read back from the saved file.
bow_corpus = corpora.MmCorpus("/content/gdrive/My Drive/data/gensim/BoW_AGnews_corpus.mm")
bow_corpus[0]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.test.utils import datapath

# Save the gensim dictionary to Drive.
# NOTE(review): datapath() is meant to resolve names inside gensim's bundled
# test-data directory. This path is absolute, so it presumably returns it
# unchanged -- confirm, and consider passing the path string directly.
dict_file = datapath("/content/gdrive/My Drive/data/gensim/gensim_dictionary_AGnews")
gensim_dictionary.save(dict_file)