# -*- coding: utf-8 -*-
# BERTopic topic-modelling pipeline (Gist by @Kudusch, created 2024-04-05)
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import pandas as pd
import pickle
import csv
# German stopwords from nltk, downloaded on 2024-02-23
with open("Data/stopwords_german.txt", "r") as f:
german_stop_words = [l.strip() for l in f.readlines()]
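# The stopword file is assumed to hold one word per line (e.g. as exported
# via nltk.corpus.stopwords.words("german")).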
print("Reading data ...", end = "\r")
df = pd.read_csv("Data/all_posts.csv")
docs = df["text"].tolist()
print(f"Read {df.shape[0]} lines")
print("Setting up models")
# Loading this model requires the protobuf package
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", cache_folder="Data", device="cuda")
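# paraphrase-multilingual-mpnet-base-v2 is a multilingual sentence-embedding
# model that covers 50+ languages, including German.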
hdbscan_model = HDBSCAN(
    min_cluster_size=round(df.shape[0]/1_000),
    min_samples=20,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True
)
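# Note: min_cluster_size is tied to corpus size (~0.1 % of the documents), so
# a topic only forms once it covers a meaningful share of the data. For small
# corpora this rounds toward 0; a floor such as max(15, ...) may be safer
# (an assumption, not part of the original script).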
# Set up dimension reduction
umap_model = UMAP(
    n_neighbors=round(df.shape[0]/100),
    n_components=5,
    min_dist=0.0,
    metric="cosine"
)
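# n_neighbors also scales with corpus size (~1 % of the documents), and
# min_dist=0.0 packs points tightly, which tends to help the downstream
# HDBSCAN clustering.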
# Create the CountVectorizer model
vectorizer_model = CountVectorizer(
    stop_words=german_stop_words + ["http", "https", "html", "co"],
    lowercase=True
)
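# The vectorizer only shapes the c-TF-IDF topic representations; it does not
# affect the embedding or clustering steps.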
# Create the BERTopic model
topic_model = BERTopic(
    embedding_model=sentence_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    calculate_probabilities=True,
    top_n_words=7,
    verbose=True
)
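# calculate_probabilities=True makes HDBSCAN compute a full document-topic
# probability matrix, which can be noticeably slower on large corpora.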
print("Embedding documents ...", end = "\r")
try:
with open("Output/all_embedded_docs.pkl", "rb") as f:
docs_embeddings = pickle.load(f)
print("Embeddings read from disk")
except:
docs_embeddings = sentence_model.encode(docs, show_progress_bar=True)
with open("Output/all_embedded_docs.pkl", "wb") as f:
pickle.dump(docs_embeddings, f)
print("Fitting topic model ...", end = "\r")
topics, probabilities = topic_model.fit_transform(docs, embeddings=docs_embeddings)
print(f"{len(set(topics))-1} topics found")
print("Writing topic data to disk")
with open("Output/all_topics.pkl", "wb") as f:
pickle.dump((topics, probabilities), f)
print("Writing dtm to disk")
topic_term_matrix = topic_model.c_tf_idf_
with open("Output/dtm.pkl") as f:
pickle.dump((topic_term_matrix))
with open("Output/all_topics.csv", "w") as f:
writer = csv.writer(f, dialect="unix")
writer.writerow(["uri", "topic"] + [f"topic_{t}" for t in set(topics) if t != -1])
for i in range(len(docs)):
writer.writerow([df.loc[i, "id"], topics[i]] + [p for p in probabilities[i]])
with open("Output/all_topic_labels.csv", "w") as f:
f.write("topic, label")
topic_labels = topic_model.generate_topic_labels(nr_words=7, separator=", ")
for l in topic_labels:
f.write(f"{l}\n")