Created: October 2, 2023, 02:44
-
-
Save jdwebprogrammer/8dcdab14b3498d364b4725542ff39df1 to your computer and use it in GitHub Desktop.
Unsupervised ML - Topic Modeling LDA (Latent Dirichlet Allocation) Class
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
class LDATopicModel:
    """Topic model pairing a ``CountVectorizer`` with scikit-learn's LDA.

    The vectorizer is configured with ``stop_words='english'`` and a cap of
    ``max_features`` vocabulary terms; the LDA estimator is created with
    ``n_topics`` components and a fixed ``random_state``.

    Args:
        n_topics: Number of LDA components (topics).
        max_features: ``max_features`` passed to the ``CountVectorizer``.
        random_state: Seed passed to ``LatentDirichletAllocation``.
            Defaults to 42, the value that was previously hard-coded.
    """

    def __init__(self, n_topics: int = 5, max_features: int = 1000,
                 random_state=42):
        self.n_topics = n_topics
        self.max_features = max_features
        self.random_state = random_state
        self.vectorizer = CountVectorizer(
            max_features=self.max_features, stop_words='english'
        )
        self.lda_model = LatentDirichletAllocation(
            n_components=self.n_topics, random_state=self.random_state
        )

    def fit_transform(self, documents):
        """Vectorize ``documents`` and fit the LDA model on the result.

        NOTE: despite the name, this does not return per-document topic
        weights. It returns the document-term matrix together with the
        fitted estimator, which is what existing callers unpack.

        Args:
            documents: Iterable of raw text documents.

        Returns:
            A ``(document_term_matrix, lda_model)`` tuple, where
            ``lda_model`` is ``self.lda_model`` after fitting.
        """
        # Vectorize the text data
        document_term_matrix = self.vectorizer.fit_transform(documents)
        # Fit the LDA model
        self.lda_model.fit(document_term_matrix)
        return document_term_matrix, self.lda_model

    def get_top_words(self, feature_names=None, n_top_words: int = 10):
        """Return the highest-weighted words of every topic.

        Args:
            feature_names: Sequence mapping a vocabulary index to its word.
                When ``None``, the vocabulary is read from
                ``self.vectorizer.get_feature_names_out()``.
            n_top_words: Maximum number of words to return per topic.

        Returns:
            One list per row of ``self.lda_model.components_``, each holding
            up to ``n_top_words`` words ordered from highest to lowest weight.

        Raises:
            ValueError: If ``n_top_words`` is negative.
        """
        if n_top_words < 0:
            # A negative count would silently select the wrong slice.
            raise ValueError(
                f"n_top_words must be non-negative, got {n_top_words}"
            )
        if feature_names is None:
            # `is None` rather than truthiness: the vocabulary may be a
            # NumPy array, whose truth value is ambiguous.
            feature_names = self.vectorizer.get_feature_names_out()
        return [
            # argsort is ascending; reverse it and keep the leading entries.
            [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
            for topic in self.lda_model.components_
        ]

    def display_topics(self, feature_names=None, n_top_words: int = 10) -> None:
        """Print each topic's top words, one topic per paragraph.

        For every topic this prints a ``Topic <k>:`` header (1-based), the
        space-separated top words, and a blank line.

        Args:
            feature_names: Sequence mapping a vocabulary index to its word.
                When ``None``, the fitted vectorizer's vocabulary is used.
            n_top_words: Maximum number of words to print per topic.

        Raises:
            ValueError: If ``n_top_words`` is negative.
        """
        top_words = self.get_top_words(feature_names, n_top_words)
        for topic_number, words in enumerate(top_words, start=1):
            print(f"Topic {topic_number}:")
            print(" ".join(words))
            print()
# Demo: fit a three-topic model on a tiny corpus and print each topic's
# highest-weighted words.
documents = [
    "Machine learning is a subfield of artificial intelligence.",
    "Natural language processing is used in text analysis.",
    "Deep learning models require large datasets.",
    "Topic modeling helps discover hidden themes in text data.",
]

lda_topic_model = LDATopicModel(n_topics=3)

# fit_transform hands back the document-term matrix and the fitted LDA model.
document_term_matrix, lda_model = lda_topic_model.fit_transform(documents)

# The vocabulary learned by the vectorizer maps column indices back to words.
feature_names = lda_topic_model.vectorizer.get_feature_names_out()
lda_topic_model.display_topics(feature_names)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.