Skip to content

Instantly share code, notes, and snippets.

@alex2awesome
Created December 22, 2020 01:08
Show Gist options
  • Save alex2awesome/a62b338b59477343c813a380aee1d7a5 to your computer and use it in GitHub Desktop.
Save alex2awesome/a62b338b59477343c813a380aee1d7a5 to your computer and use it in GitHub Desktop.
Quick LDA with SKLearn Pipelining
import pandas as pd
from gensim.sklearn_api import LdaTransformer
from gensim.corpora import Dictionary
from sklearn.base import BaseEstimator, MetaEstimatorMixin
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
class CV2BOW(BaseEstimator, MetaEstimatorMixin):
    """Turn a sparse document-term count matrix into per-document bag-of-words lists.

    Stateless adapter step: it takes the sparse matrix produced by the
    preceding vectorizer and emits, for every row, a list of
    ``(token_id, token_count)`` pairs for the downstream LDA step.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit inside a pipeline."""
        return self

    def transform(self, X):
        """Return one ``[(token_id, token_count), ...]`` list per row of ``X``.

        Parameters
        ----------
        X : sparse matrix exposing ``tocoo()``
            Rows are documents, columns are token ids, values are counts.

        Returns
        -------
        list of list of tuple
            ``result[i]`` holds the stored entries of row ``i`` in COO order.
            Rows without any stored entry yield an empty list, so the output
            always has exactly ``X.shape[0]`` items and stays aligned with
            the input documents.
        """
        coo = X.tocoo()
        # Size the output from the matrix shape, not from the set of rows that
        # happen to contain entries: a document whose tokens were all pruned
        # must still occupy its slot, otherwise later documents are dropped
        # and the remaining ones shift out of position.
        bows = [[] for _ in range(coo.shape[0])]
        for row, col, count in zip(coo.row, coo.col, coo.data):
            # (token_id, token_count)
            bows[row].append((col, count))
        return bows
# Text -> term counts -> (token_id, count) lists -> 10-topic LDA.
pipeline__lda = Pipeline(
    steps=[
        # Vocabulary is pruned by document frequency (min_df / max_df) and
        # by the built-in English stop-word list.
        ("cv", CountVectorizer(stop_words="english", min_df=.01, max_df=0.5)),
        # Adapter between the sparse count matrix and the LDA step.
        ("doc2bow", CV2BOW()),
        ("lda", LdaTransformer(iterations=50, num_topics=10)),
    ]
)

# NOTE(review): `docs` is not defined in this file; presumably an iterable of
# raw text documents supplied by the caller -- confirm before running.
lda = pipeline__lda.fit_transform(docs)
# Term-by-topic weight table: one row per vocabulary term, one column per topic.
# The frame is built with terms as column labels and then transposed.
# Sorting the vocabulary keys is what labels the columns -- this assumes the
# vectorizer's column order matches the sorted term order (TODO confirm).
beta_matrix = pd.DataFrame(
    pipeline__lda.named_steps['lda'].gensim_model.expElogbeta,
    columns=sorted(pipeline__lda.named_steps['cv'].vocabulary_),
).transpose()
# Summarise each topic: the three highest-weighted terms form the dict key,
# and the next seven (ranks 4-10) form the value.
top_words = {}
for topic_id in beta_matrix.columns:
    ranked_terms = beta_matrix[topic_id].sort_values(ascending=False).index
    first, second, third = ranked_terms[0], ranked_terms[1], ranked_terms[2]
    top_words[f'{first}, {second}, {third}'] = ranked_terms[3:10].tolist()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment