Skip to content

Instantly share code, notes, and snippets.

View MaartenGr's full-sized avatar

Maarten Grootendorst MaartenGr

View GitHub Profile
# Snippet: refine the topic representations of an already-fitted BERTopic model
# (assumes `model`, `docs`, and `topics` were produced by an earlier fit_transform).
from sklearn.feature_extraction.text import CountVectorizer
# n-gram range (1, 3) lets topic words include two- and three-word phrases;
# English stopwords are dropped from the representation.
cv = CountVectorizer(ngram_range=(1, 3), stop_words="english")
# Option A: pass a custom CountVectorizer object.
# NOTE(review): recent BERTopic releases call this keyword `vectorizer_model` —
# confirm against the installed BERTopic version.
model.update_topics(docs, topics, vectorizer=cv)
# Update topic representation by increasing n-gram range and removing english stopwords
# Option B: pass the equivalent settings as plain keyword arguments instead.
model.update_topics(docs, topics, n_gram_range=(1, 3), stop_words="english")
# Snippet: fit a topic model, then shrink the number of topics afterwards.
from bertopic import BERTopic
model = BERTopic()
# `topics`: one topic id per document; `probs`: per-document topic probabilities
# (assumes `docs` is an iterable of document strings — defined elsewhere).
topics, probs = model.fit_transform(docs)
# Further reduce topics
# Reduce the fitted model to (approximately) 30 topics; returns the remapped
# topic assignments and probabilities for the same documents.
new_topics, new_probs = model.reduce_topics(docs, topics, probs, nr_topics=30)
# Snippet: control the number of topics at construction time.
from bertopic import BERTopic
# nr_topics="auto" lets BERTopic choose the topic count itself.
model = BERTopic(nr_topics="auto")
# Alternatively, request a fixed number of topics up front.
from bertopic import BERTopic
model = BERTopic(nr_topics=20)
# Visualize the topic probability distribution of a single document.
# NOTE(review): `probabilities` is not defined in this snippet — the other
# snippets name the fit_transform output `probs`; presumably the same array.
model.visualize_distribution(probabilities[0])
# Snippet: end-to-end example — fit BERTopic on the 20 Newsgroups corpus.
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
# Strip headers, footers, and quoted replies so topics reflect message bodies only.
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
model = BERTopic()
# Returns a topic id per document and per-document topic probabilities.
topics, probs = model.fit_transform(docs)
# Restore a model previously persisted with model.save("my_model").
loaded_model = BERTopic.load("my_model")
# Snippet: persist a model to disk so it can later be restored with BERTopic.load().
from bertopic import BERTopic
model = BERTopic()
model.save("my_model")
# Snippet: use a non-default sentence-transformers embedding model.
# NOTE(review): "xlm-r-bert-base-nli-stsb-mean-tokens" appears to be an XLM-R
# based multilingual sentence-transformers model — confirm the exact model name
# against the sentence-transformers pretrained-model list.
from bertopic import BERTopic
model = BERTopic(embedding_model="xlm-r-bert-base-nli-stsb-mean-tokens")