Skip to content

Instantly share code, notes, and snippets.

@dataman-git
dataman-git / tfidf_corpus
Last active February 20, 2023 03:28
tfidf_corpus
from gensim.models import TfidfModel
# Fit a TF-IDF model on the bag-of-words corpus.
# A different SMART weighting scheme can be chosen with e.g. smartirs='npu'.
tfidf = TfidfModel(bow_corpus)
# Applying the model to a whole corpus gives a lazy wrapper, not a list.
tfidf_corpus = tfidf[bow_corpus]
# Slicing the lazy wrapper yields another lazy wrapper, so printing the slice
# only shows an object repr; materialise the first three documents instead.
print(list(tfidf[bow_corpus[:3]]))
# NOTE(review): this pairs each word with its raw count from bow_corpus, exactly
# as the bag-of-words snippet does; iterate tfidf_corpus instead if the TF-IDF
# weights were intended -- confirm with the author.
id_words = [
    [(gensim_dictionary[token_id], count) for token_id, count in line]
    for line in bow_corpus
]
print(id_words)
@dataman-git
dataman-git / bow_corpus
Last active February 20, 2023 03:27
bow_corpus
from gensim.corpora import Dictionary
# Start from an empty dictionary; it is filled while the documents are encoded.
gensim_dictionary = Dictionary()
# allow_update=True makes doc2bow register unseen tokens in the dictionary as
# each tokenized document is converted to (token_id, count) pairs.
bow_corpus = [
    gensim_dictionary.doc2bow(tokens, allow_update=True)
    for tokens in text_tokenized
]
print(bow_corpus[:3])
# Same corpus, with each token id looked up in the dictionary for readability.
id_words = [
    [(gensim_dictionary[token_id], count) for token_id, count in bow]
    for bow in bow_corpus
]
print(id_words)
@dataman-git
dataman-git / text_tokenized
Last active February 20, 2023 03:26
text_tokenized
from gensim.parsing.preprocessing import preprocess_string
# Run gensim's preprocess_string over every entry of the 'Description' column,
# collecting one result per document.
text_tokenized = [preprocess_string(doc) for doc in train['Description']]
# Notebook-style peek at the first three processed documents.
text_tokenized[0:3]
@dataman-git
dataman-git / train_data
Last active February 20, 2023 03:26
train_data
import pandas as pd
import numpy as np
# Show full column contents when displaying frames. None is the supported
# "no limit" value; the old -1 sentinel was deprecated in pandas 1.0 and is
# rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Load the AG News training split from the given path.
train = pd.read_csv("/content/gdrive/My Drive/data/gensim/ag_news_train.csv")
@dataman-git
dataman-git / save_html
Last active February 18, 2023 16:35
save_html
# Write the pyLDAvis visualisation object `p` (built elsewhere) to an HTML file.
html_path = "/content/gdrive/My Drive/data/gensim/gensim_LDA_AGnews.html"
pyLDAvis.save_html(p, html_path)
@dataman-git
dataman-git / print(cosine_similarity(df, df))
Last active February 17, 2023 02:37
print(cosine_similarity(df, df))
from sklearn.metrics.pairwise import cosine_similarity
# Densify the sparse count matrix and wrap it in a DataFrame (one row per
# document, one column per vocabulary term).
df = pd.DataFrame(data=cv_fit.toarray())
# Pairwise cosine similarity of every document row against every other row.
print(cosine_similarity(X=df, Y=df))
@dataman-git
dataman-git / cv_fit.toarray()
Last active February 17, 2023 02:36
cv_fit.toarray()
# Notebook-style display: convert the count matrix `cv_fit` to a dense array
# so the per-document term counts can be inspected.
cv_fit.toarray()
@dataman-git
dataman-git / cv_fit
Last active February 17, 2023 02:36
cv_fit
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Learn the vocabulary of doc_list and count term occurrences per document.
cv = CountVectorizer()
cv_fit = cv.fit_transform(doc_list)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. It returns an ndarray, so convert back to a plain list
# to keep word_list the same type as before.
word_list = cv.get_feature_names_out().tolist()
# Total occurrences of each vocabulary term across all documents.
count_list = cv_fit.toarray().sum(axis=0)
@dataman-git
dataman-git / doc_list
Last active February 17, 2023 02:35
doc_list
# Toy corpus: five short lyric lines, one document per string.
doc_list = [
    "Start spreading the news",
    "You're leaving today (tell him friend)",
    "I want to be a part of it, New York, New York",
    "Your vagabond shoes, they are longing to stray",
    "And steps around the heart of it, New York, New York",
]
@dataman-git
dataman-git / lda_bow.print_topic(i))
Created February 15, 2023 23:38
lda_bow.print_topic(i))
# Print the string form of three hand-picked topics from the lda_bow model.
for topic_id in (55, 16, 0):
    print("Topic", topic_id, "is:", lda_bow.print_topic(topic_id))