# -*- coding: utf-8 -*-
# BERTopic topic-modelling pipeline (Gist by @Kudusch, created 2024-04-05)
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import pandas as pd
import pickle
import csv
# German stopwords from nltk, downloaded on 2024-02-23
with open("Data/stopwords_german.txt", "r") as f:
german_stop_words = [l.strip() for l in f.readlines()]
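# The stopword file is assumed to hold one word per line (e.g. as exported
# via nltk.corpus.stopwords.words("german")).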
print("Reading data ...", end = "\r")
df = pd.read_csv("Data/all_posts.csv")
docs = df["text"].tolist()
print(f"Read {df.shape[0]} lines")
print("Setting up models")
# Loading this model requires the protobuf package
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", cache_folder="Data", device="cuda")
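# paraphrase-multilingual-mpnet-base-v2 is a multilingual sentence-embedding
# model that covers 50+ languages, including German.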
hdbscan_model = HDBSCAN(
    min_cluster_size=round(df.shape[0]/1_000),
    min_samples=20,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True
)
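# Note: min_cluster_size is tied to corpus size (~0.1 % of the documents), so
# a topic only forms once it covers a meaningful share of the data. For small
# corpora this rounds toward 0; a floor such as max(15, ...) may be safer
# (an assumption, not part of the original script).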
# Set up dimension reduction
umap_model = UMAP(
    n_neighbors=round(df.shape[0]/100),
    n_components=5,
    min_dist=0.0,
    metric="cosine"
)
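# n_neighbors also scales with corpus size (~1 % of the documents), and
# min_dist=0.0 packs points tightly, which tends to help the downstream
# HDBSCAN clustering.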
# Create the CountVectorizer model
vectorizer_model = CountVectorizer(
    stop_words=german_stop_words + ["http", "https", "html", "co"],
    lowercase=True
)
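# The vectorizer only shapes the c-TF-IDF topic representations; it does not
# affect the embedding or clustering steps.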
# Create the BERTopic model
topic_model = BERTopic(
    embedding_model=sentence_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    calculate_probabilities=True,
    top_n_words=7,
    verbose=True
)
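# calculate_probabilities=True makes HDBSCAN compute a full document-topic
# probability matrix, which can be noticeably slower on large corpora.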
print("Embedding documents ...", end = "\r")
try:
with open("Output/all_embedded_docs.pkl", "rb") as f:
docs_embeddings = pickle.load(f)
print("Embeddings read from disk")
except:
docs_embeddings = sentence_model.encode(docs, show_progress_bar=True)
with open("Output/all_embedded_docs.pkl", "wb") as f:
pickle.dump(docs_embeddings, f)
print("Fitting topic model ...", end = "\r")
topics, probabilities = topic_model.fit_transform(docs, embeddings=docs_embeddings)
print(f"{len(set(topics))-1} topics found")
print("Writing topic data to disk")
with open("Output/all_topics.pkl", "wb") as f:
pickle.dump((topics, probabilities), f)
print("Writing dtm to disk")
topic_term_matrix = topic_model.c_tf_idf_
with open("Output/dtm.pkl") as f:
pickle.dump((topic_term_matrix))
with open("Output/all_topics.csv", "w") as f:
writer = csv.writer(f, dialect="unix")
writer.writerow(["uri", "topic"] + [f"topic_{t}" for t in set(topics) if t != -1])
for i in range(len(docs)):
writer.writerow([df.loc[i, "id"], topics[i]] + [p for p in probabilities[i]])
with open("Output/all_topic_labels.csv", "w") as f:
f.write("topic, label")
topic_labels = topic_model.generate_topic_labels(nr_words=7, separator=", ")
for l in topic_labels:
f.write(f"{l}\n")