Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jdwebprogrammer/8dcdab14b3498d364b4725542ff39df1 to your computer and use it in GitHub Desktop.
Save jdwebprogrammer/8dcdab14b3498d364b4725542ff39df1 to your computer and use it in GitHub Desktop.
Unsupervised ML - Topic Modeling LDA (Latent Dirichlet Allocation) Class
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
class LDATopicModel:
    """Topic model pairing a bag-of-words vectorizer with scikit-learn's LDA.

    The vectorizer is a ``CountVectorizer`` that drops English stop words and
    keeps at most ``max_features`` terms; the topic model is a
    ``LatentDirichletAllocation`` estimator with ``n_topics`` components and a
    fixed ``random_state`` of 42.
    """

    def __init__(self, n_topics=5, max_features=1000):
        """Create the (unfitted) vectorizer and LDA estimator.

        Args:
            n_topics: Number of topics, passed to LDA as ``n_components``.
            max_features: Vocabulary size cap passed to the vectorizer.
        """
        self.n_topics = n_topics
        self.max_features = max_features
        self.vectorizer = CountVectorizer(max_features=self.max_features, stop_words='english')
        self.lda_model = LatentDirichletAllocation(n_components=self.n_topics, random_state=42)

    def fit_transform(self, documents):
        """Vectorize ``documents`` and fit the LDA model on the result.

        Despite the name, this does not return a document-topic matrix; it
        returns the document-term matrix together with the fitted estimator.

        Args:
            documents: Iterable of raw text documents.

        Returns:
            A ``(document_term_matrix, lda_model)`` tuple, where ``lda_model``
            is the same object as ``self.lda_model``.
        """
        # Vectorize the text data
        document_term_matrix = self.vectorizer.fit_transform(documents)
        # Fit the LDA model
        self.lda_model.fit(document_term_matrix)
        return document_term_matrix, self.lda_model

    def _top_words(self, feature_names, n_top_words):
        """Return, per topic, the ``n_top_words`` highest-weighted terms."""
        ranked = []
        for topic in self.lda_model.components_:
            # argsort is ascending, so reverse it to rank by descending weight.
            best_first = topic.argsort()[::-1][:n_top_words]
            ranked.append([feature_names[i] for i in best_first])
        return ranked

    def display_topics(self, feature_names=None, n_top_words=10):
        """Print the top-weighted words of every fitted topic.

        Args:
            feature_names: Sequence mapping term index to term. When omitted,
                it is read from the fitted vectorizer.
            n_top_words: How many words to print per topic; values larger than
                the vocabulary simply print every term.

        Raises:
            ValueError: If ``n_top_words`` is negative. (A negative count used
                to slice the ranking incorrectly instead of failing.)
        """
        if n_top_words < 0:
            raise ValueError(f"n_top_words must be non-negative, got {n_top_words}")
        # Compare against None explicitly: feature names may arrive as a NumPy
        # array, whose truth value is ambiguous.
        if feature_names is None:
            feature_names = self.vectorizer.get_feature_names_out()
        for topic_idx, words in enumerate(self._top_words(feature_names, n_top_words)):
            print(f"Topic {topic_idx + 1}:")
            print(" ".join(words))
            print()
# Demo: fit a three-topic model on a tiny corpus and print each topic's top words.
documents = [
    "Machine learning is a subfield of artificial intelligence.",
    "Natural language processing is used in text analysis.",
    "Deep learning models require large datasets.",
    "Topic modeling helps discover hidden themes in text data.",
]
lda_topic_model = LDATopicModel(n_topics=3)

# fit_transform hands back the document-term matrix and the fitted LDA estimator.
document_term_matrix, lda_model = lda_topic_model.fit_transform(documents)

# Vocabulary learned by the vectorizer; display_topics indexes into it per topic.
feature_names = lda_topic_model.vectorizer.get_feature_names_out()
lda_topic_model.display_topics(feature_names)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment