Code snippets by Maarten Grootendorst (MaartenGr)

tfidf_embeddings.py
from bertopic import BERTopic
from sklearn.feature_extraction.text import TfidfVectorizer

# docs is a list of document strings prepared beforehand
# Create a sparse TF-IDF matrix to use as document embeddings
vectorizer = TfidfVectorizer(min_df=5)
embeddings = vectorizer.fit_transform(docs)

# Train BERTopic on the pre-computed TF-IDF embeddings
model = BERTopic(stop_words="english")
topics, probabilities = model.fit_transform(docs, embeddings)
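Once fitted, the resulting topics can be inspected. A minimal sketch, assuming the model trained above; get_topic returns (word, score) tuples and get_topic_freq the size of each topic:

# Inspect the top words of topic 0 (topic -1 collects outlier documents)
print(model.get_topic(0))
# Frequency of each topic across the corpus
print(model.get_topic_freq())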

custom_embeddings.py
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Pre-compute sentence embeddings for docs (a list of document strings)
sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Create the topic model and pass in the pre-computed embeddings
model = BERTopic()
topics, probabilities = model.fit_transform(docs, embeddings)
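Because the embeddings are computed outside of BERTopic, they can be reused when experimenting with different settings instead of re-encoding the documents each run. A sketch under that assumption; the names model_small and topics_small are illustrative:

# Re-use the same pre-computed embeddings for a second run
# with different hyperparameters (no re-encoding needed)
model_small = BERTopic(nr_topics=20)
topics_small, probs_small = model_small.fit_transform(docs, embeddings)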

cv_topic_update.py
from sklearn.feature_extraction.text import CountVectorizer

# Update the topic representations of an already fitted model
# with a custom CountVectorizer
cv = CountVectorizer(ngram_range=(1, 3), stop_words="english")
model.update_topics(docs, topics, vectorizer=cv)

update_topic_representation.py
# Update the topic representation by increasing the n-gram range and removing English stop words
model.update_topics(docs, topics, n_gram_range=(1, 3), stop_words="english")
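After either update, the new representation can be checked per topic; a minimal sketch assuming the fitted model from above:

# The words and scores of topic 0 now reflect the new vectorizer settings
print(model.get_topic(0))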

new_topic_reduction.py
from bertopic import BERTopic
model = BERTopic()
topics, probs = model.fit_transform(docs)
# Further reduce topics
new_topics, new_probs = model.reduce_topics(docs, topics, probs, nr_topics=30)
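To verify the reduction, the remaining topics can be counted from the topic frequency table; a minimal sketch assuming the model above:

# Count the remaining topics after reduction (includes the -1 outlier topic)
print(len(model.get_topic_freq()))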

auto_topic_reduction.py
from bertopic import BERTopic

# Automatically reduce the number of topics after they are created
model = BERTopic(nr_topics="auto")

manual_topic_reduction.py
from bertopic import BERTopic

# Manually reduce the number of topics to 20 after they are created
model = BERTopic(nr_topics=20)

train.py
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups documents without headers, footers, and quotes
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']

# Train the topic model
model = BERTopic()
topics, probs = model.fit_transform(docs)
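The trained model can be persisted to disk so it can be restored later, as in load.py below; a minimal sketch using the same path name:

# Save the fitted model under the name used in load.py
model.save("my_model")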

load.py
from bertopic import BERTopic
loaded_model = BERTopic.load("my_model")
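A loaded model can then assign topics to unseen documents; a minimal sketch in which new_docs is an illustrative placeholder for a list of new document strings:

# Predict topics for unseen documents with the restored model
new_topics, new_probs = loaded_model.transform(new_docs)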