Skip to content

Instantly share code, notes, and snippets.

@dataman-git
dataman-git / #!pip install bertopic
Created February 21, 2023 03:55
#!pip install bertopic
#!pip install bertopic
# You may want to install more depending on the transformers and language backends that you will be using. The possible installations are:
#!pip install bertopic[flair]
#!pip install bertopic[gensim]
#!pip install bertopic[spacy]
#!pip install bertopic[use]
@dataman-git
dataman-git / unseen_doc
Last active February 20, 2023 03:35
unseen_doc
# Tokenised documents that were not part of the training corpus.
other_texts = [
    ['champion', 'hockei', 'qualifi'],
    ['survey', 'tournament', 'world'],
]

# Turn every token list into its bag-of-words form using the existing
# dictionary.
other_corpus = list(map(gensim_dictionary.doc2bow, other_texts))

# Query the BOW ensemble model with the first unseen document to get its
# topic probability distribution.
unseen_doc = other_corpus[0]
vector = elda_bow[unseen_doc]
vector
@dataman-git
dataman-git / coherence_elda
Created February 20, 2023 00:33
coherence_elda
# Evaluate the BOW ensemble model with the 'c_v' coherence measure.
from gensim.models.coherencemodel import CoherenceModel

coherence_elda_bow = CoherenceModel(
    model=elda_bow,
    texts=text_tokenized,
    dictionary=gensim_dictionary,
    coherence='c_v',
)

# Compute the score once, keep it for later use, and report it.
coherence_elda = coherence_elda_bow.get_coherence()
print('\nCoherence Score: ', coherence_elda)
@dataman-git
dataman-git / elda_bow_print
Last active February 20, 2023 03:33
elda_bow_print
import pprint as pp

# Fetch the topic listing once and reuse it for both the pretty-print and the
# count, instead of asking the model to build (and log) it twice.
topics = elda_bow.print_topics()
pp.pprint(topics)

# NOTE(review): gensim's print_topics() defaults to num_topics=20, so this
# length is presumably capped at 20 rather than being the model's full topic
# count -- pass num_topics=-1 if the total is what is wanted; confirm.
len(topics)
@dataman-git
dataman-git / elda_tfidf
Last active February 20, 2023 03:32
elda_tfidf
from gensim.test.utils import datapath

# Destination files for the two ensemble LDA models (BOW and TF-IDF).
# NOTE(review): datapath comes from gensim's test utilities; with an absolute
# path it presumably returns the path unchanged, so passing the string
# directly may be clearer -- confirm before changing.
elda_bow_file = datapath(
    "/content/gdrive/My Drive/data/gensim/eLDA_bow_AGnews"
)
elda_tfidf_file = datapath(
    "/content/gdrive/My Drive/data/gensim/eLDA_tfidf_AGnews"
)

# Persist the eLDA model trained on BOW data, then the one trained on TF-IDF.
elda_bow.save(elda_bow_file)
elda_tfidf.save(elda_tfidf_file)
@dataman-git
dataman-git / elda_bow
Last active February 20, 2023 03:31
elda_bow
from gensim.corpora.dictionary import Dictionary
from gensim.models import ensemblelda

# Build one ensemble LDA model per corpus representation. Both use the same
# dictionary and the same num_topics setting; only the corpus differs.
elda_bow = ensemblelda.EnsembleLda(
    corpus=bow_corpus,
    id2word=gensim_dictionary,
    num_topics=600,
)
elda_tfidf = ensemblelda.EnsembleLda(
    corpus=tfidf_corpus,
    id2word=gensim_dictionary,
    num_topics=600,
)
@dataman-git
dataman-git / pickle.load
Last active February 20, 2023 03:30
pickle.load
import pickle

# Load the pickled bag-of-words corpus. The `with` block closes the file even
# if unpickling raises, which the manual open()/close() pair did not.
# NOTE: pickle can execute arbitrary code while loading, so only unpickle
# files you created yourself.
with open("/content/gdrive/My Drive/data/gensim/BoW_AGnews_corpus.pkl", 'rb') as file:
    bow_corpus = pickle.load(file)

# Peek at the first document of the restored corpus.
bow_corpus[0]
@dataman-git
dataman-git / pickle.dump
Last active February 20, 2023 03:30
pickle.dump
# You can use pickle too.
import pickle

# Open the file where you want to store the data. The `with` block flushes
# and closes it even if pickling raises, so no handle is leaked and no
# manual close() is needed.
with open("/content/gdrive/My Drive/data/gensim/BoW_AGnews_corpus.pkl", 'wb') as file:
    pickle.dump(bow_corpus, file)
@dataman-git
dataman-git / MmCorpus
Last active February 20, 2023 03:29
MmCorpus
from gensim import corpora

# Single location used for both writing and reading the corpus.
mm_path = "/content/gdrive/My Drive/data/gensim/BoW_AGnews_corpus.mm"

# Save in the Matrix Market format.
corpora.MmCorpus.serialize(mm_path, bow_corpus)

# Load it back; bow_corpus now refers to the MmCorpus read from that file.
bow_corpus = corpora.MmCorpus(mm_path)
bow_corpus[0]
@dataman-git
dataman-git / dict_file
Last active February 20, 2023 03:28
dict_file
from gensim.test.utils import datapath

# Where the gensim dictionary is written.
# NOTE(review): datapath is a helper from gensim's test utilities; for an
# absolute path it presumably returns the path as-is -- confirm.
dict_file = datapath(
    "/content/gdrive/My Drive/data/gensim/gensim_dictionary_AGnews"
)

# Persist the dictionary to that file.
gensim_dictionary.save(dict_file)