This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train the OCTIS CTM topic model on a custom dataset and collect its topics.
from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset

# Load a custom dataset from a local folder.
dataset = Dataset()
dataset.load_custom_dataset_from_folder("../health_dataset")

# NOTE(review): `embedding` is not defined in this snippet; it must be set
# before this line -- presumably the name of the embedding model; confirm.
model = CTM(num_topics=5, inference_type='combined', bert_model=embedding)
output = model.train_model(dataset)

# The training result is indexed by 'topics' to obtain the learned topics.
topics = output['topics']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# k * vocab | |
X_per_cluster = self.vectorizer_model.transform(concatenated_documents) | |
# D * vocab | |
X_origin = self.vectorizer_model.transform(origin_documents) | |
if self.word_select_method == 'tfidf_idfi': | |
socres = TFIDF_IDFi(X_per_cluster, X_origin, all_documents).socre() | |
elif self.word_select_method == 'tfidf_tfi': | |
socres = TFIDF_TFi(X_per_cluster, X_origin, all_documents).socre() | |
elif self.word_select_method == 'tfi': |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if reduce_size:
    # reduce embedding dimensionality; the projected result replaces
    # `embeddings` in place, so later steps see the 10-component version
    embeddings = UMAP(
        n_neighbors=15,
        n_components=10,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    ).fit_transform(embeddings)
# clustering, here we set 5 clusters, i.e. 5 topics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Encode the sentences with the SimCSE model named below.
from simcse import SimCSE

model = SimCSE("princeton-nlp/sup-simcse-bert-base-uncased")

# NOTE(review): `sentences` is not defined in this snippet; it must already
# exist -- presumably a list of strings, one per document; confirm.
embeddings = model.encode(sentences)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from octis.dataset.dataset import Dataset

# load the preprocessed dataset
dataset = Dataset()
dataset.load_custom_dataset_from_folder("../health_dataset")

# make sentences and token_lists: each entry of the corpus is a sequence of
# tokens, and each sentence is those tokens joined with single spaces
token_lists = dataset.get_corpus()
sentences = [' '.join(tokens) for tokens in token_lists]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
td: 0.78 npmi: 0.14918231549764865 cv: 0.7123101179651162
Topics: {
    0: [
        ('game', 0.02901369643627368),
        ('team', 0.020770988781233964),
        ('playoff', 0.01725616490773988),
        ('pitcher', 0.016094534492616757),
        ('player', 0.01470956934923902),
        ('play', 0.014078737937412301),
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from baselines.cetopictm import CETopicTM | |
from utils import prepare_dataset | |
dataset, sentences = prepare_dataset('20ng') | |
tm = CETopicTM(dataset=dataset, | |
topic_model='cetopic', | |
num_topics=20, | |
dim_size=5, | |
word_select_method='tfidf_idfi', |