Zihan Zhang ZhangzihanGit

## combinedTM.py
from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset

# Load the preprocessed corpus from its folder on disk.
dataset = Dataset()
dataset.load_custom_dataset_from_folder("../health_dataset")

# Configure a CTM with 5 topics and 'combined' inference.
# NOTE: `embedding` is not defined in this snippet; it must already be in scope.
model = CTM(
    num_topics=5,
    inference_type='combined',
    bert_model=embedding,
)

# Train on the dataset and keep the 'topics' entry of the training output.
output = model.train_model(dataset)
topics = output['topics']

## select_words.py
# shape k x vocab: one row per cluster (its concatenated documents)
X_per_cluster = self.vectorizer_model.transform(concatenated_documents)
# shape D x vocab: one row per original document
X_origin = self.vectorizer_model.transform(origin_documents)

if self.word_select_method == 'tfidf_idfi':
    socres = TFIDF_IDFi(X_per_cluster, X_origin, all_documents).socre()
elif self.word_select_method == 'tfidf_tfi':
    socres = TFIDF_TFi(X_per_cluster, X_origin, all_documents).socre()
elif self.word_select_method == 'tfi':

## clustering.py
if reduce_size:
  # Project the embeddings down to 10 components with UMAP before clustering.
  # A fixed random_state is passed so the same seed is used on every run.
  reducer = UMAP(n_neighbors=15, n_components=10, min_dist=0.0,
                 metric='cosine', random_state=42)
  embeddings = reducer.fit_transform(embeddings)

# clustering, here we set 5 clusters, i.e. 5 topics

## embedding.py
from simcse import SimCSE

# Instantiate SimCSE from the named checkpoint, then encode every sentence.
# NOTE: `sentences` is not defined in this snippet; it must already be in scope.
simcse_checkpoint = "princeton-nlp/sup-simcse-bert-base-uncased"
model = SimCSE(simcse_checkpoint)
embeddings = model.encode(sentences)

## load_data.py
from octis.dataset.dataset import Dataset

# Load the preprocessed dataset from its folder.
dataset = Dataset()
dataset.load_custom_dataset_from_folder("../health_dataset")

# The corpus is iterated as token lists; each one is joined with single
# spaces to give one sentence string per document.
token_lists = dataset.get_corpus()
sentences = list(map(' '.join, token_lists))

## example_output.py
td: 0.78 npmi: 0.14918231549764865 cv: 0.7123101179651162

Topics: {
0: [
    ('game', 0.02901369643627368),
    ('team', 0.020770988781233964),
    ('playoff', 0.01725616490773988),
    ('pitcher', 0.016094534492616757),
    ('player', 0.01470956934923902),
    ('play', 0.014078737937412301),

## example.py
from baselines.cetopictm import CETopicTM
from utils import prepare_dataset

dataset, sentences = prepare_dataset('20ng')

tm = CETopicTM(dataset=dataset,
               topic_model='cetopic',
               num_topics=20,
               dim_size=5,
               word_select_method='tfidf_idfi',
	from octis.models.CTM import CTM
	from octis.dataset.dataset import Dataset

	# Load the preprocessed corpus from its folder on disk.
	dataset = Dataset()
	dataset.load_custom_dataset_from_folder("../health_dataset")

	# Configure a CTM with 5 topics and 'combined' inference.
	# NOTE: `embedding` is not defined in this snippet; it must already be in scope.
	model = CTM(
		num_topics=5,
		inference_type='combined',
		bert_model=embedding,
	)

	# Train on the dataset and keep the 'topics' entry of the training output.
	output = model.train_model(dataset)
	topics = output['topics']
	# shape k x vocab: one row per cluster (its concatenated documents)
	X_per_cluster = self.vectorizer_model.transform(concatenated_documents)
	# shape D x vocab: one row per original document
	X_origin = self.vectorizer_model.transform(origin_documents)

	if self.word_select_method == 'tfidf_idfi':
	socres = TFIDF_IDFi(X_per_cluster, X_origin, all_documents).socre()
	elif self.word_select_method == 'tfidf_tfi':
	socres = TFIDF_TFi(X_per_cluster, X_origin, all_documents).socre()
	elif self.word_select_method == 'tfi':
	if reduce_size:
		# Reduce the embedding dimensionality with UMAP before clustering.
		# The body must be nested under the `if`: with the flattened
		# indentation the guard had no body (an IndentationError) and could
		# not control whether the reduction runs.
		embeddings = UMAP(
			n_neighbors=15,
			n_components=10,
			min_dist=0.0,
			metric='cosine',
			random_state=42,
		).fit_transform(embeddings)

	# clustering, here we set 5 clusters, i.e. 5 topics
	from simcse import SimCSE

	# Instantiate SimCSE from the named checkpoint, then encode every sentence.
	# NOTE: `sentences` is not defined at this point in the snippet.
	simcse_checkpoint = "princeton-nlp/sup-simcse-bert-base-uncased"
	model = SimCSE(simcse_checkpoint)
	embeddings = model.encode(sentences)
	from octis.dataset.dataset import Dataset

	# Load the preprocessed dataset from its folder.
	dataset = Dataset()
	dataset.load_custom_dataset_from_folder("../health_dataset")

	# The corpus is iterated as token lists; each one is joined with single
	# spaces to give one sentence string per document.
	token_lists = dataset.get_corpus()
	sentences = list(map(' '.join, token_lists))
	td: 0.78 npmi: 0.14918231549764865 cv: 0.7123101179651162

	Topics: {
	0: [
	('game', 0.02901369643627368),
	('team', 0.020770988781233964),
	('playoff', 0.01725616490773988),
	('pitcher', 0.016094534492616757),
	('player', 0.01470956934923902),
	('play', 0.014078737937412301),
	from baselines.cetopictm import CETopicTM
	from utils import prepare_dataset

	dataset, sentences = prepare_dataset('20ng')

	tm = CETopicTM(dataset=dataset,
	topic_model='cetopic',
	num_topics=20,
	dim_size=5,
	word_select_method='tfidf_idfi',