import pyLDAvis
import pyLDAvis.gensim  # pyLDAvis.gensim_models in newer pyLDAvis releases

# Load the saved 16-topic model and render the interactive topic view
ldamodel = gensim.models.ldamulticore.LdaMulticore.load("ldamodel_for_16topics_Run_10")
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
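# Outside a notebook, the prepared view can be written to standalone HTML with
# pyLDAvis's save_html; a minimal sketch (the output filename is an assumption):
vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.save_html(vis, 'lda_16topics_run_10.html')  # filename is illustrative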
# Plot coherence score against the number of topics
import matplotlib.pyplot as plt

x = range(2, 25)
plt.plot(x, mat['coherence_score'], label='coherence_values')
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()  # Num Topics = 4 has the highest coherence score
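# To read the best k off the matrix programmatically rather than by eye, a
# minimal sketch (assumes the `mat` dataframe loaded above; the k values live
# in the 'LDA_Model' column written by the coherence loop):
best_row = mat.loc[mat['coherence_score'].idxmax()]
print(f"Best num_topics: {int(best_row['LDA_Model'])} (coherence = {best_row['coherence_score']:.4f})")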
# Load the coherence scores back from disk
import pandas as pd

mat = pd.read_csv('coherence_matrix_10.csv')
mat = mat.reset_index(drop=True)  # reset_index returns a copy, so reassign
mat
# Compute c_v coherence for each saved model, then persist the scores
coherence = []
for k in range(2, 25):
    LDA = gensim.models.ldamulticore.LdaMulticore
    ldamodel = LDA.load(f"ldamodel_for_{k}topics_Run_10")
    cm = gensim.models.coherencemodel.CoherenceModel(model=ldamodel, texts=Complete_Content,
                                                     dictionary=dictionary, coherence='c_v')
    coherence.append((k, 'default', 'default', cm.get_coherence()))
pd.DataFrame(coherence, columns=['LDA_Model', 'alpha', 'eta', 'coherence_score']).to_csv('coherence_matrix_10.csv', index=False)
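# The 'alpha' and 'eta' columns above are written as 'default' placeholders,
# which leaves room for a hyperparameter sweep. A hedged sketch of such a
# sweep at the best k (the grid values are illustrative, not from the original):
for alpha in ['symmetric', 'asymmetric', 0.3]:
    for eta in ['symmetric', 0.3]:
        ldamodel = gensim.models.ldamulticore.LdaMulticore(
            doc_term_matrix, num_topics=4, id2word=dictionary,
            passes=20, random_state=20, alpha=alpha, eta=eta)
        cm = gensim.models.coherencemodel.CoherenceModel(
            model=ldamodel, texts=Complete_Content, dictionary=dictionary, coherence='c_v')
        coherence.append((4, alpha, eta, cm.get_coherence()))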
# Train LDA on different values of k
import gensim
from pprint import pprint

for k in range(2, 25):
    print('Round: ' + str(k))
    LDA = gensim.models.ldamulticore.LdaMulticore
    ldamodel = LDA(doc_term_matrix, num_topics=k, id2word=dictionary, passes=20, iterations=100,
                   chunksize=10000, eval_every=10, random_state=20)
    ldamodel.save(f"ldamodel_for_{k}topics_Run_10")
    pprint(ldamodel.print_topics())
# Function to load the dictionary and doc-to-term matrix from file
from gensim import corpora

def load_dict_and_docterm_matrix(dict_path, matrix_path):
    """
    Load and return the corpus dictionary and document-to-term matrix.
    Arguments:
        dict_path: path to the corpus dictionary
        matrix_path: path to the corpus document-to-term matrix
    """
    # Body reconstructed (the original gist truncates after the docstring);
    # assumes the artifacts were saved with Dictionary.save and MmCorpus.serialize
    dictionary = corpora.Dictionary.load(dict_path)
    doc_term_matrix = corpora.MmCorpus(matrix_path)
    return dictionary, doc_term_matrix
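# Hypothetical usage; the paths must match whatever the create function saved
dictionary, doc_term_matrix = load_dict_and_docterm_matrix('corpus.dict', 'doc_term_matrix.mm')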
# Define the function to create the dictionary and document-to-term matrix
def create_dic_and_docterm_matrix(Complete_Content, dict_file_path, matrix_file_path):
    """
    Create the corpus dictionary and document-to-term matrix.
    Arguments:
        Complete_Content: tokenized text corpus
        dict_file_path: file path to save the dictionary
        matrix_file_path: file path to save the matrix
    Returns:
        dictionary and document-to-term matrix (bag-of-words corpus)
    """
    # Body reconstructed (the original gist truncates after the docstring);
    # a standard gensim bag-of-words pipeline is assumed
    dictionary = corpora.Dictionary(Complete_Content)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in Complete_Content]
    dictionary.save(dict_file_path)
    corpora.MmCorpus.serialize(matrix_file_path, doc_term_matrix)
    return dictionary, doc_term_matrix
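# Build and persist both artifacts from the tokenized corpus (file names are illustrative)
dictionary, doc_term_matrix = create_dic_and_docterm_matrix(Complete_Content, 'corpus.dict', 'doc_term_matrix.mm')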
# Total number of tokens in the corpus
tokens = []
for article in DF['Updated_content']:
    for word in article:
        tokens.append(word)
len(tokens)  # There are about 150,000 tokens in total in the corpus
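# A quick vocabulary check to go with the raw token count; a small sketch
# using collections.Counter on the same token list:
from collections import Counter

token_counts = Counter(tokens)
print(f"Total tokens: {len(tokens)}, unique tokens: {len(token_counts)}")
print(token_counts.most_common(10))  # ten most frequent tokens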