Skip to content

Instantly share code, notes, and snippets.

View ZhangzihanGit's full-sized avatar

Zihan Zhang ZhangzihanGit

View GitHub Profile
# Example: train an OCTIS CTM topic model with 5 topics on a custom dataset
# and read the extracted topics from the training output.
from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset
dataset = Dataset()
# Populate the dataset from a folder on disk (relative path).
dataset.load_custom_dataset_from_folder("../health_dataset")
# NOTE(review): `embedding` is not defined anywhere in this snippet; it must be
# assigned before this line runs -- confirm what value `bert_model` expects.
model = CTM(num_topics=5, inference_type='combined', bert_model=embedding)
output = model.train_model(dataset)
# The training result is indexed by key; the 'topics' entry is kept here.
topics = output['topics']
# k * vocab
X_per_cluster = self.vectorizer_model.transform(concatenated_documents)
# D * vocab
X_origin = self.vectorizer_model.transform(origin_documents)
if self.word_select_method == 'tfidf_idfi':
socres = TFIDF_IDFi(X_per_cluster, X_origin, all_documents).socre()
elif self.word_select_method == 'tfidf_tfi':
socres = TFIDF_TFi(X_per_cluster, X_origin, all_documents).socre()
elif self.word_select_method == 'tfi':
if reduce_size:
# reduce embedding dimensionality
embeddings = UMAP(n_neighbors=15,
n_components=10,
min_dist=0.0,
metric='cosine',
random_state=42
).fit_transform(embeddings)
# clustering, here we set 5 clusters, i.e. 5 topics
# Example: encode sentences into embeddings with a pretrained SimCSE checkpoint.
from simcse import SimCSE
# The checkpoint is selected by its model name string.
model = SimCSE("princeton-nlp/sup-simcse-bert-base-uncased")
# NOTE(review): `sentences` is not defined in this snippet; presumably a list of
# plain-text strings prepared elsewhere -- confirm before running.
embeddings = model.encode(sentences)
# Example: load a dataset from disk and rebuild one plain-text sentence per document.
from octis.dataset.dataset import Dataset
# load the preprocessed dataset
dataset = Dataset()
dataset.load_custom_dataset_from_folder("../health_dataset")
# make sentences and token_lists
# NOTE(review): get_corpus() is iterated below as a collection of token lists,
# presumably one list per document -- confirm against the OCTIS docs.
token_lists = dataset.get_corpus()
# Join each document's tokens with single spaces to form its sentence string.
sentences = [' '.join(text_list) for text_list in token_lists]
td: 0.78 npmi: 0.14918231549764865 cv: 0.7123101179651162
Topics: {
0: [
('game', 0.02901369643627368),
('team', 0.020770988781233964),
('playoff', 0.01725616490773988),
('pitcher', 0.016094534492616757),
('player', 0.01470956934923902),
('play', 0.014078737937412301),
@ZhangzihanGit
ZhangzihanGit / example.py
Last active May 18, 2022 04:46
Example of using CETopic to extract topics
from baselines.cetopictm import CETopicTM
from utils import prepare_dataset
dataset, sentences = prepare_dataset('20ng')
tm = CETopicTM(dataset=dataset,
topic_model='cetopic',
num_topics=20,
dim_size=5,
word_select_method='tfidf_idfi',